refined prediction

author: gebele <gebele@in-silico.ch> 2016-01-22 23:18:56 +0100
committer: gebele <gebele@in-silico.ch> 2016-01-22 23:18:56 +0100
commit: 88ac5fbe3d8d3141fbad81460b13d6cb8284da26 (patch)
tree: 783f9da714155a5848c84905159a51b002c6a80c
parent: 09b750e1639d351d24cff3cca74681c761b17503 (diff)
8 files changed, 157 insertions, 83 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b844b14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Gemfile.lock
diff --git a/Gemfile b/Gemfile
index 7a3ee30..9193d08 100644
--- a/Gemfile
+++ b/Gemfile
@@ -2,3 +2,4 @@ source "https://rubygems.org"
 gemspec
 gem "sinatra"
 gem "haml"
+gem "rserve-client"
diff --git a/application.rb b/application.rb
index 50f027a..16985d5 100644
--- a/application.rb
+++ b/application.rb
@@ -10,8 +10,9 @@ get '/?' do
 end
 
 get '/predict/?' do
-  @data = JSON.parse(File.read("./data.json"))
-  @example = @data[@data.keys.sample]["physchem"]
+  data = JSON.parse(File.read("./data.json"))
+  relevant_features = JSON.parse(File.read("./relevant-features.json"))
+  @example = data[data.keys.sample]["physchem"].select{|f,v| relevant_features.keys.include? f}
   #@json_example = JSON.pretty_generate(@example)
   haml :predict
 end
diff --git a/feature-filter.rb b/feature-filter.rb
new file mode 100644
index 0000000..3765842
--- /dev/null
+++ b/feature-filter.rb
@@ -0,0 +1,36 @@
+require 'rserve'
+require 'json'
+require 'yaml'
+require 'csv'
+
+R = Rserve::Connection.new
+ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])"
+
+def feature_filter
+  data = JSON.parse(File.read("./data.json"))
+  features = data["G15.AC"]["physchem"].keys
+  R.assign "tox", data.collect{|id,cats| cats["tox"][ENDPOINT]}
+  filtered_features = {}
+  features.each do |feature|
+    R.assign "feature", data.collect{|id,cats| cats["physchem"][feature]}
+    begin
+      #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
+      R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
+      pvalue = R.eval("cor$p.value").to_ruby
+      if pvalue <= 0.05
+        r = R.eval("cor$estimate").to_ruby
+        filtered_features[feature] = {}
+        filtered_features[feature]["pvalue"] = pvalue
+        filtered_features[feature]["r"] = r
+      end
+    rescue
+      f = data.collect{|id,cats| cats["physchem"][feature]}
+      f = R.eval("feature").to_ruby
+      p f.collect{|f| p f; Math.log f}
+      p R.eval("log(feature)").to_ruby
+    end
+  end
+  filtered_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+end
+
+puts feature_filter.to_json
diff --git a/nanoparticles.rb b/nanoparticles.rb
index e554029..e34e509 100644
--- a/nanoparticles.rb
+++ b/nanoparticles.rb
@@ -1,3 +1,4 @@
+require 'rserve'
 require 'json'
 require 'yaml'
 require 'csv'
@@ -9,24 +10,32 @@ def predict params
   sim_sum = 0
   weighted_sum = 0
   match = nil
+  relevant_features = JSON.parse(File.read("./relevant-features.json"))
+  weights = relevant_features.values.collect{|v| v["r"]}
   JSON.parse(File.read("./data.json")).each do |id,categories|
-    if params.values == categories["physchem"].values
-      match = {:id => categories}
+    neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values
+    if params.values == neighbor_values
+      match = {id => categories}
     else
-      sim = cosine_similarity(params.values,categories["physchem"].values)
-      neighbor = categories
-      neighbor["similarity"] = sim
-      neighbor["id"] = id
-      sim_sum += sim
-      weighted_sum += sim*Math.log(categories["tox"][ENDPOINT])
-      neighbors << neighbor
+      sim = weighted_cosine_similarity(params.values,neighbor_values,weights)
+      if sim > 0.95
+        neighbor = categories
+        neighbor["similarity"] = sim
+        neighbor["sim"] = cosine_similarity(params.values,neighbor_values)
+        neighbor["id"] = id
+        sim_sum += sim
+        weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT])
+        #weighted_sum += sim*categories["tox"][ENDPOINT]
+        neighbors << neighbor
+      end
     end
   end
   neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
+  sim_sum == 0 ? prediction = nil : prediction =  10**(weighted_sum/sim_sum)
   {
     :query => params,
     :match => match,
-    :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)},
+    :prediction => {ENDPOINT => prediction},
     :neighbors => neighbors
   }
 end
@@ -44,18 +53,32 @@ end
 
 def dot_product(a, b)
   products = a.zip(b).map{|a, b| a * b}
-  products.inject(0){|s,p| s + p}
+  products.inject(0) {|s,p| s + p}
 end
 
 def magnitude(point)
-  squares = point.map{|x| x.to_f ** 2}
+  squares = point.map{|x| x ** 2}
   Math.sqrt(squares.inject(0) {|s, c| s + c})
 end
 
+# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
 def cosine_similarity(a, b)
   dot_product(a, b) / (magnitude(a) * magnitude(b))
 end
 
+def weighted_cosine_similarity(a, b, w)
+  dot_product = 0
+  magnitude_a = 0
+  magnitude_b = 0
+  (0..a.size-1).each do |i|
+    dot_product += w[i].abs*a[i]*b[i]
+    magnitude_a += w[i].abs*a[i]**2
+    magnitude_b += w[i].abs*b[i]**2
+  end
+  dot_product/Math.sqrt(magnitude_a*magnitude_b)
+
+end
+
 #@endpoint = @data.collect{|r| r[5]}
 
 def neighbors query
diff --git a/relevant-features.json b/relevant-features.json
new file mode 100644
index 0000000..8da5f0a
--- /dev/null
+++ b/relevant-features.json
@@ -0,0 +1 @@
+{"Localized Surface Plasmon Resonance (LSPR) index Human serum (Sigma #H4522)":{"pvalue":2.7781332789800217e-10,"r":0.5673665177300351},"ZETA POTENTIAL Change":{"pvalue":2.071052722262855e-09,"r":-0.5435850485069125},"ZETA POTENTIAL [mV]":{"pvalue":2.025461243171378e-08,"r":0.5141366184476799},"Intensity Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":7.296459421546331e-06,"r":0.4220954992507819},"Volume Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.00014304093718453537,"r":0.3627394229575142},"Z-Average Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.0013415551974216289,"r":0.3089691292079323},"Localized Surface Plasmon Resonance (LSPR) index Change":{"pvalue":0.0020134808685383643,"r":0.2980672974262308},"Total surface area (SAtot) Human serum (Sigma #H4522) [cm^2]":{"pvalue":0.009242146870015556,"r":-0.2529069909205578},"Localized Surface Plasmon Resonance (LSPR) index":{"pvalue":0.010019903744117542,"r":0.2502907155942361},"Volume Mean Hydrodynamic Diameter Change":{"pvalue":0.013109585368169974,"r":0.24139711318406543},"Polydispersity index Change":{"pvalue":0.020454612521909568,"r":-0.22597482832068308},"Volume Mean Hydrodynamic Diameter [nm]":{"pvalue":0.033722917694600785,"r":0.20744306643756433},"Polydispersity index [nm]":{"pvalue":0.04433575298419301,"r":0.19667713971723438},"Number Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.04474107346052025,"r":0.19631065152636545}}
diff --git a/views/predict.haml b/views/predict.haml
index 6abd5bc..4584cbf 100644
--- a/views/predict.haml
+++ b/views/predict.haml
@@ -1,6 +1,6 @@
 %div.well
   %form{:role=>"form", :action=> to("/predict"), :method=>"post"}
-    %span.help-block
+    %h3.help-block
       Please characterise a Gold nanoparticle
     #input
     - size = @example.size
diff --git a/views/prediction.haml b/views/prediction.haml
index 99bd2a0..113e934 100644
--- a/views/prediction.haml
+++ b/views/prediction.haml
@@ -22,94 +22,105 @@
         %h5= key
         %input.form-control{:id=>id,:type=>"text",:value=>"#{val}", :disabled=>"true"}
     #match.tab-pane.fade
-      - if @prediction[:match]  
-        %table{:id=>"match"}
+      - if @prediction[:match]
+        %table{:id=>"match", :style=>"border-style: 1px solid black;margin-top:10px;"}
           %thead
             %tr
-            - @prediction[:match][:id].keys.each do |key|
-              %th= key.capitalize
+              %th
+                ID
+              %th
+                Composition
+              %th
+                Tox
+              %th
+                Physchem
           %tbody
             %tr
+              / ID
+              %td
+                %h5= @prediction[:match].keys[0]
               / composition
               %td
-                - @prediction[:match][:id]["composition"].each do |k,v|
+                - @prediction[:match].values[0]["composition"].each do |k,v|
                   %h5= k
                   %p= v
               / tox
               %td
-                - @prediction[:match][:id]["tox"].each do |k,v|
+                - @prediction[:match].values[0]["tox"].each do |k,v|
                   %h5= k
                   %p= v
               / physchem
               %td
-                - @prediction[:match][:id]["physchem"].each do |k,v|
+                - @prediction[:match].values[0]["physchem"].each do |k,v|
                   %h5= k
                   %p= v
       - else
         %h5 No match
     #prediction.tab-pane.in.active
-      - @prediction[:prediction].each do |k,v|
-        %h5= k
-        %p= v.round(5)
+      - if @prediction[:prediction]
+        - @prediction[:prediction].each do |k,v|
+          %h5= k
+          %p= v.round(5)
     #neighbors.tab-pane.fade
-      :javascript
-        $(document).ready(function(){
-          $("table#match").tablesorter({
-            debug: false,
-            theme: "bootstrap",
-            headerTemplate: '{content} {icon}',
-            widgets: ['uitheme'],
-            headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}},
-            sortList: [[1,1]],
-            widthFixed: false
+      - if @prediction[:neighbors]
+        :javascript
+          $(document).ready(function(){
+            $("table#match").tablesorter({
+              debug: false,
+              theme: "bootstrap",
+              headerTemplate: '{content} {icon}',
+              widgets: ['uitheme'],
+              headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}, 3: {sorter: false}},
+              sortList: [[1,1]],
+              widthFixed: false
+            });
           });
-        });
-        $(document).ready(function(){
-          $("table#neighbors").tablesorter({
-            debug: false,
-            theme: "bootstrap",
-            headerTemplate: '{content} {icon}',
-            widgets: ['uitheme'],
-            sortList: [[1,1]],
-            widthFixed: false
+          $(document).ready(function(){
+            $("table#neighbors").tablesorter({
+              debug: false,
+              theme: "bootstrap",
+              headerTemplate: '{content} {icon}',
+              widgets: ['uitheme'],
+              sortList: [[1,1]],
+              widthFixed: false
+            });
           });
-        });
-      %div.table-responsive
-        %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"}
-          %thead
-            %tr
-              %th{:style =>"vertical-align:middle;"}
-                ID  
-              %th{:style =>"vertical-align:middle;"}
-                Similarity
-              %th{:style =>"vertical-align:middle;"}
-                Composition  
-              %th{:style =>"vertical-align:middle;"}
-                Tox
-              %th{:style =>"vertical-align:middle;"}
-                Physchem
-          %tbody
-          - @prediction[:neighbors].each do |neighbor|
-            %tr
-              / ID
-              %td
-                %h5= neighbor["id"]
-              / Similarity
-              %td
-                %h5= neighbor["similarity"].round(3)
-              / Composition
-              %td
-                - neighbor["composition"].each do |k,v|
-                  %h5= k
-                  %p= v
-              / Tox
-              %td
-                - neighbor["tox"].each do |k,v|
-                  %h5= k
-                  %p= v.round(3)
-              / Physchem
-              %td
-                - neighbor["physchem"].each do |k,v|
-                  %h5= k
-                  %p= v
+        %div.table-responsive
+          %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"}
+            %thead
+              %tr
+                %th{:style =>"vertical-align:middle;"}
+                  ID  
+                %th{:style =>"vertical-align:middle;"}
+                  Similarity
+                %th{:style =>"vertical-align:middle;"}
+                  Composition  
+                %th{:style =>"vertical-align:middle;"}
+                  Tox
+                %th{:style =>"vertical-align:middle;"}
+                  Physchem
+            %tbody
+            - @prediction[:neighbors].each do |neighbor|
+              %tr
+                / ID
+                %td
+                  %h5= neighbor["id"]
+                / Similarity
+                %td
+                  %h5= neighbor["similarity"].round(3)
+                / Composition
+                %td
+                  - neighbor["composition"].each do |k,v|
+                    %h5= k
+                    %p= v
+                / Tox
+                %td
+                  - neighbor["tox"].each do |k,v|
+                    %h5= k
+                    %p= v.round(3)
+                / Physchem
+                %td
+                  - neighbor["physchem"].each do |k,v|
+                    %h5= k
+                    %p= v
author	gebele <gebele@in-silico.ch>	2016-01-22 23:18:56 +0100
committer	gebele <gebele@in-silico.ch>	2016-01-22 23:18:56 +0100
commit	88ac5fbe3d8d3141fbad81460b13d6cb8284da26 (patch)
tree	783f9da714155a5848c84905159a51b002c6a80c
parent	09b750e1639d351d24cff3cca74681c761b17503 (diff)