summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author gebele <gebele@in-silico.ch> 2016-01-22 23:18:56 +0100
committer gebele <gebele@in-silico.ch> 2016-01-22 23:18:56 +0100
commit 88ac5fbe3d8d3141fbad81460b13d6cb8284da26 (patch)
tree 783f9da714155a5848c84905159a51b002c6a80c
parent 09b750e1639d351d24cff3cca74681c761b17503 (diff)
refined prediction
-rw-r--r-- .gitignore 1
-rw-r--r-- Gemfile 1
-rw-r--r-- application.rb 5
-rw-r--r-- feature-filter.rb 36
-rw-r--r-- nanoparticles.rb 47
-rw-r--r-- relevant-features.json 1
-rw-r--r-- views/predict.haml 2
-rw-r--r-- views/prediction.haml 147
8 files changed, 157 insertions, 83 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b844b14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Gemfile.lock
diff --git a/Gemfile b/Gemfile
index 7a3ee30..9193d08 100644
--- a/Gemfile
+++ b/Gemfile
@@ -2,3 +2,4 @@ source "https://rubygems.org"
gemspec
gem "sinatra"
gem "haml"
+gem "rserve-client"
diff --git a/application.rb b/application.rb
index 50f027a..16985d5 100644
--- a/application.rb
+++ b/application.rb
@@ -10,8 +10,9 @@ get '/?' do
end
# GET /predict: renders the :predict Haml view with @example set to the
# physchem values of one randomly sampled entry from ./data.json, restricted
# to the feature names that appear as keys in ./relevant-features.json.
get '/predict/?' do
  data = JSON.parse(File.read("./data.json"))
  relevant_features = JSON.parse(File.read("./relevant-features.json"))
  # Hash#key? is a direct lookup; keys.include? rebuilt the key array for
  # every feature being filtered.
  @example = data.values.sample["physchem"].select { |feature, _value| relevant_features.key?(feature) }
  #@json_example = JSON.pretty_generate(@example)
  haml :predict
end
diff --git a/feature-filter.rb b/feature-filter.rb
new file mode 100644
index 0000000..3765842
--- /dev/null
+++ b/feature-filter.rb
@@ -0,0 +1,36 @@
+require 'rserve'
+require 'json'
+require 'yaml'
+require 'csv'
+
# Rserve connection that feature_filter uses to assign vectors and evaluate R code.
R = Rserve::Connection.new
# Key of the "tox" value that features are correlated against; frozen so the
# constant cannot be mutated in place.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])".freeze
+
# Selects the physchem features whose Pearson correlation with the ENDPOINT
# tox value is significant (p-value <= 0.05).
#
# Reads ./data.json, sends the tox vector and each feature vector to R through
# the R connection and runs cor.test on them.
#
# Returns a Hash mapping feature name => {"pvalue" => ..., "r" => ...},
# ordered by ascending p-value.
def feature_filter
  data = JSON.parse(File.read("./data.json"))
  # Feature names are taken from one reference entry.
  # NOTE(review): presumably every entry carries the same physchem keys - confirm.
  features = data["G15.AC"]["physchem"].keys
  R.assign "tox", data.collect { |_id, cats| cats["tox"][ENDPOINT] }
  filtered_features = {}
  features.each do |feature|
    R.assign "feature", data.collect { |_id, cats| cats["physchem"][feature] }
    begin
      # A log-transformed variant, cor.test(-log(tox),-log(feature),use='complete'),
      # was tried here previously.
      R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
      pvalue = R.eval("cor$p.value").to_ruby
      next unless pvalue <= 0.05

      filtered_features[feature] = {
        "pvalue" => pvalue,
        "r" => R.eval("cor$estimate").to_ruby
      }
    rescue StandardError => e
      # Diagnostics go to stderr: stdout carries the JSON result printed by
      # the script, and a failing feature must not abort the remaining ones.
      warn "feature_filter: skipping #{feature.inspect}: #{e.class}: #{e.message}"
    end
  end
  filtered_features.sort_by { |_feature, stats| stats["pvalue"] }.to_h
end
+
+puts feature_filter.to_json
diff --git a/nanoparticles.rb b/nanoparticles.rb
index e554029..e34e509 100644
--- a/nanoparticles.rb
+++ b/nanoparticles.rb
@@ -1,3 +1,4 @@
+require 'rserve'
require 'json'
require 'yaml'
require 'csv'
@@ -9,24 +10,32 @@ def predict params
sim_sum = 0
weighted_sum = 0
match = nil
+ relevant_features = JSON.parse(File.read("./relevant-features.json"))
+ weights = relevant_features.values.collect{|v| v["r"]}
JSON.parse(File.read("./data.json")).each do |id,categories|
- if params.values == categories["physchem"].values
- match = {:id => categories}
+ neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values
+ if params.values == neighbor_values
+ match = {id => categories}
else
- sim = cosine_similarity(params.values,categories["physchem"].values)
- neighbor = categories
- neighbor["similarity"] = sim
- neighbor["id"] = id
- sim_sum += sim
- weighted_sum += sim*Math.log(categories["tox"][ENDPOINT])
- neighbors << neighbor
+ sim = weighted_cosine_similarity(params.values,neighbor_values,weights)
+ if sim > 0.95
+ neighbor = categories
+ neighbor["similarity"] = sim
+ neighbor["sim"] = cosine_similarity(params.values,neighbor_values)
+ neighbor["id"] = id
+ sim_sum += sim
+ weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT])
+ #weighted_sum += sim*categories["tox"][ENDPOINT]
+ neighbors << neighbor
+ end
end
end
neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
+ sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
{
:query => params,
:match => match,
- :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)},
+ :prediction => {ENDPOINT => prediction},
:neighbors => neighbors
}
end
@@ -44,18 +53,32 @@ end
# Dot product of two equally long numeric vectors.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Numeric] sum of the pairwise products; 0 for empty vectors
def dot_product(a, b)
  # Block parameters are named x/y so they no longer shadow a and b.
  a.zip(b).inject(0) { |sum, (x, y)| sum + x * y }
end
# Euclidean length of a numeric vector.
#
# @param point [Array<Numeric>]
# @return [Float] square root of the sum of squared components; 0.0 for []
def magnitude(point)
  Math.sqrt(point.inject(0) { |sum, x| sum + x**2 })
end
+# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
# Cosine similarity of vectors a and b: their dot product divided by the
# product of their magnitudes.
def cosine_similarity(a, b)
  numerator = dot_product(a, b)
  denominator = magnitude(a) * magnitude(b)
  numerator / denominator
end
# Cosine similarity of a and b in which dimension i is scaled by |w[i]|.
#
# Only the first a.size entries of b and w are used; surplus entries are
# ignored.
#
# @param a [Array<Numeric>] first vector
# @param b [Array<Numeric>] second vector, at least as long as a
# @param w [Array<Numeric>] per-dimension weights (the sign is discarded),
#   at least as long as a
# @return [Float] weighted cosine similarity; 0.0 when either weighted
#   magnitude is zero (including empty a), where the plain quotient would be NaN
# @raise [ArgumentError] if b or w has fewer entries than a
def weighted_cosine_similarity(a, b, w)
  if b.size < a.size || w.size < a.size
    raise ArgumentError,
          "b (#{b.size}) and w (#{w.size}) need at least #{a.size} entries"
  end

  # Accumulators are named so they do not shadow the dot_product method.
  weighted_dot = 0
  norm_a = 0
  norm_b = 0
  a.each_with_index do |x, i|
    weight = w[i].abs
    y = b[i]
    weighted_dot += weight * x * y
    norm_a += weight * x**2
    norm_b += weight * y**2
  end

  denominator = Math.sqrt(norm_a * norm_b)
  return 0.0 if denominator.zero?

  weighted_dot / denominator
end
+
#@endpoint = @data.collect{|r| r[5]}
def neighbors query
diff --git a/relevant-features.json b/relevant-features.json
new file mode 100644
index 0000000..8da5f0a
--- /dev/null
+++ b/relevant-features.json
@@ -0,0 +1 @@
+{"Localized Surface Plasmon Resonance (LSPR) index Human serum (Sigma #H4522)":{"pvalue":2.7781332789800217e-10,"r":0.5673665177300351},"ZETA POTENTIAL Change":{"pvalue":2.071052722262855e-09,"r":-0.5435850485069125},"ZETA POTENTIAL [mV]":{"pvalue":2.025461243171378e-08,"r":0.5141366184476799},"Intensity Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":7.296459421546331e-06,"r":0.4220954992507819},"Volume Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.00014304093718453537,"r":0.3627394229575142},"Z-Average Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.0013415551974216289,"r":0.3089691292079323},"Localized Surface Plasmon Resonance (LSPR) index Change":{"pvalue":0.0020134808685383643,"r":0.2980672974262308},"Total surface area (SAtot) Human serum (Sigma #H4522) [cm^2]":{"pvalue":0.009242146870015556,"r":-0.2529069909205578},"Localized Surface Plasmon Resonance (LSPR) index":{"pvalue":0.010019903744117542,"r":0.2502907155942361},"Volume Mean Hydrodynamic Diameter Change":{"pvalue":0.013109585368169974,"r":0.24139711318406543},"Polydispersity index Change":{"pvalue":0.020454612521909568,"r":-0.22597482832068308},"Volume Mean Hydrodynamic Diameter [nm]":{"pvalue":0.033722917694600785,"r":0.20744306643756433},"Polydispersity index [nm]":{"pvalue":0.04433575298419301,"r":0.19667713971723438},"Number Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.04474107346052025,"r":0.19631065152636545}}
diff --git a/views/predict.haml b/views/predict.haml
index 6abd5bc..4584cbf 100644
--- a/views/predict.haml
+++ b/views/predict.haml
@@ -1,6 +1,6 @@
%div.well
%form{:role=>"form", :action=> to("/predict"), :method=>"post"}
- %span.help-block
+ %h3.help-block
Please characterise a Gold nanoparticle
#input
- size = @example.size
diff --git a/views/prediction.haml b/views/prediction.haml
index 99bd2a0..113e934 100644
--- a/views/prediction.haml
+++ b/views/prediction.haml
@@ -22,94 +22,105 @@
%h5= key
%input.form-control{:id=>id,:type=>"text",:value=>"#{val}", :disabled=>"true"}
#match.tab-pane.fade
- - if @prediction[:match]
- %table{:id=>"match"}
+ - if @prediction[:match]
+ %table{:id=>"match", :style=>"border-style: 1px solid black;margin-top:10px;"}
%thead
%tr
- - @prediction[:match][:id].keys.each do |key|
- %th= key.capitalize
+ %th
+ ID
+ %th
+ Composition
+ %th
+ Tox
+ %th
+ Physchem
%tbody
%tr
+ / ID
+ %td
+ %h5= @prediction[:match].keys[0]
/ composition
%td
- - @prediction[:match][:id]["composition"].each do |k,v|
+ - @prediction[:match].values[0]["composition"].each do |k,v|
%h5= k
%p= v
/ tox
%td
- - @prediction[:match][:id]["tox"].each do |k,v|
+ - @prediction[:match].values[0]["tox"].each do |k,v|
%h5= k
%p= v
/ physchem
%td
- - @prediction[:match][:id]["physchem"].each do |k,v|
+ - @prediction[:match].values[0]["physchem"].each do |k,v|
%h5= k
%p= v
- else
%h5 No match
#prediction.tab-pane.in.active
- - @prediction[:prediction].each do |k,v|
- %h5= k
- %p= v.round(5)
+ - if @prediction[:prediction]
+ - @prediction[:prediction].each do |k,v|
+ %h5= k
+ %p= v.round(5)
#neighbors.tab-pane.fade
- :javascript
- $(document).ready(function(){
- $("table#match").tablesorter({
- debug: false,
- theme: "bootstrap",
- headerTemplate: '{content} {icon}',
- widgets: ['uitheme'],
- headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}},
- sortList: [[1,1]],
- widthFixed: false
+ - if @prediction[:neighbors]
+ :javascript
+ $(document).ready(function(){
+ $("table#match").tablesorter({
+ debug: false,
+ theme: "bootstrap",
+ headerTemplate: '{content} {icon}',
+ widgets: ['uitheme'],
+ headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}, 3: {sorter: false}},
+ sortList: [[1,1]],
+ widthFixed: false
+ });
});
- });
- $(document).ready(function(){
- $("table#neighbors").tablesorter({
- debug: false,
- theme: "bootstrap",
- headerTemplate: '{content} {icon}',
- widgets: ['uitheme'],
- sortList: [[1,1]],
- widthFixed: false
+ $(document).ready(function(){
+ $("table#neighbors").tablesorter({
+ debug: false,
+ theme: "bootstrap",
+ headerTemplate: '{content} {icon}',
+ widgets: ['uitheme'],
+ sortList: [[1,1]],
+ widthFixed: false
+ });
});
- });
- %div.table-responsive
- %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"}
- %thead
- %tr
- %th{:style =>"vertical-align:middle;"}
- ID
- %th{:style =>"vertical-align:middle;"}
- Similarity
- %th{:style =>"vertical-align:middle;"}
- Composition
- %th{:style =>"vertical-align:middle;"}
- Tox
- %th{:style =>"vertical-align:middle;"}
- Physchem
- %tbody
- - @prediction[:neighbors].each do |neighbor|
- %tr
- / ID
- %td
- %h5= neighbor["id"]
- / Similarity
- %td
- %h5= neighbor["similarity"].round(3)
- / Composition
- %td
- - neighbor["composition"].each do |k,v|
- %h5= k
- %p= v
- / Tox
- %td
- - neighbor["tox"].each do |k,v|
- %h5= k
- %p= v.round(3)
- / Physchem
- %td
- - neighbor["physchem"].each do |k,v|
- %h5= k
- %p= v
+ %div.table-responsive
+ %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"}
+ %thead
+ %tr
+ %th{:style =>"vertical-align:middle;"}
+ ID
+ %th{:style =>"vertical-align:middle;"}
+ Similarity
+ %th{:style =>"vertical-align:middle;"}
+ Composition
+ %th{:style =>"vertical-align:middle;"}
+ Tox
+ %th{:style =>"vertical-align:middle;"}
+ Physchem
+ %tbody
+ - @prediction[:neighbors].each do |neighbor|
+ %tr
+ / ID
+ %td
+ %h5= neighbor["id"]
+ / Similarity
+ %td
+ %h5= neighbor["similarity"].round(3)
+ / Composition
+ %td
+ - neighbor["composition"].each do |k,v|
+ %h5= k
+ %p= v
+ / Tox
+ %td
+ - neighbor["tox"].each do |k,v|
+ %h5= k
+ %p= v.round(3)
+ / Physchem
+ %td
+ - neighbor["physchem"].each do |k,v|
+ %h5= k
+ %p= v