diff options
author | gebele <gebele@in-silico.ch> | 2016-01-22 23:18:56 +0100 |
---|---|---|
committer | gebele <gebele@in-silico.ch> | 2016-01-22 23:18:56 +0100 |
commit | 88ac5fbe3d8d3141fbad81460b13d6cb8284da26 (patch) | |
tree | 783f9da714155a5848c84905159a51b002c6a80c | |
parent | 09b750e1639d351d24cff3cca74681c761b17503 (diff) |
refined prediction
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Gemfile | 1 | ||||
-rw-r--r-- | application.rb | 5 | ||||
-rw-r--r-- | feature-filter.rb | 36 | ||||
-rw-r--r-- | nanoparticles.rb | 47 | ||||
-rw-r--r-- | relevant-features.json | 1 | ||||
-rw-r--r-- | views/predict.haml | 2 | ||||
-rw-r--r-- | views/prediction.haml | 147 |
8 files changed, 157 insertions, 83 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b844b14 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +Gemfile.lock @@ -2,3 +2,4 @@ source "https://rubygems.org" gemspec gem "sinatra" gem "haml" +gem "rserve-client" diff --git a/application.rb b/application.rb index 50f027a..16985d5 100644 --- a/application.rb +++ b/application.rb @@ -10,8 +10,9 @@ get '/?' do end get '/predict/?' do - @data = JSON.parse(File.read("./data.json")) - @example = @data[@data.keys.sample]["physchem"] + data = JSON.parse(File.read("./data.json")) + relevant_features = JSON.parse(File.read("./relevant-features.json")) + @example = data[data.keys.sample]["physchem"].select{|f,v| relevant_features.keys.include? f} #@json_example = JSON.pretty_generate(@example) haml :predict end diff --git a/feature-filter.rb b/feature-filter.rb new file mode 100644 index 0000000..3765842 --- /dev/null +++ b/feature-filter.rb @@ -0,0 +1,36 @@ +require 'rserve' +require 'json' +require 'yaml' +require 'csv' + +R = Rserve::Connection.new +ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" + +def feature_filter + data = JSON.parse(File.read("./data.json")) + features = data["G15.AC"]["physchem"].keys + R.assign "tox", data.collect{|id,cats| cats["tox"][ENDPOINT]} + filtered_features = {} + features.each do |feature| + R.assign "feature", data.collect{|id,cats| cats["physchem"][feature]} + begin + #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + filtered_features[feature] = {} + filtered_features[feature]["pvalue"] = pvalue + filtered_features[feature]["r"] = r + end + rescue + f = data.collect{|id,cats| cats["physchem"][feature]} + f = R.eval("feature").to_ruby + p f.collect{|f| p f; Math.log f} + p R.eval("log(feature)").to_ruby + end + end + filtered_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h +end + +puts feature_filter.to_json diff --git a/nanoparticles.rb b/nanoparticles.rb index e554029..e34e509 100644 --- a/nanoparticles.rb +++ b/nanoparticles.rb @@ -1,3 +1,4 @@ +require 'rserve' require 'json' require 'yaml' require 'csv' @@ -9,24 +10,32 @@ def predict params sim_sum = 0 weighted_sum = 0 match = nil + relevant_features = JSON.parse(File.read("./relevant-features.json")) + weights = relevant_features.values.collect{|v| v["r"]} JSON.parse(File.read("./data.json")).each do |id,categories| - if params.values == categories["physchem"].values - match = {:id => categories} + neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values + if params.values == neighbor_values + match = {id => categories} else - sim = cosine_similarity(params.values,categories["physchem"].values) - neighbor = categories - neighbor["similarity"] = sim - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) - neighbors << neighbor + sim = weighted_cosine_similarity(params.values,neighbor_values,weights) + if sim > 0.95 + neighbor = categories + neighbor["similarity"] = sim + neighbor["sim"] = cosine_similarity(params.values,neighbor_values) + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT]) + #weighted_sum += sim*categories["tox"][ENDPOINT] + neighbors << neighbor + end end end neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) { :query => params, :match => match, - :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, + :prediction => {ENDPOINT => prediction}, :neighbors => neighbors } end @@ -44,18 +53,32 @@ end def dot_product(a, b) products = a.zip(b).map{|a, b| a * b} - products.inject(0){|s,p| s + p} + products.inject(0) {|s,p| s + p} end def magnitude(point) - squares = point.map{|x| x.to_f ** 2} + squares = point.map{|x| x ** 2} Math.sqrt(squares.inject(0) {|s, c| s + c}) end +# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity def cosine_similarity(a, b) dot_product(a, b) / (magnitude(a) * magnitude(b)) end +def weighted_cosine_similarity(a, b, w) + dot_product = 0 + magnitude_a = 0 + magnitude_b = 0 + (0..a.size-1).each do |i| + dot_product += w[i].abs*a[i]*b[i] + magnitude_a += w[i].abs*a[i]**2 + magnitude_b += w[i].abs*b[i]**2 + end + dot_product/Math.sqrt(magnitude_a*magnitude_b) + +end + #@endpoint = @data.collect{|r| r[5]} def neighbors query diff --git a/relevant-features.json b/relevant-features.json new file mode 100644 index 0000000..8da5f0a --- /dev/null +++ b/relevant-features.json @@ -0,0 +1 @@ +{"Localized Surface Plasmon Resonance (LSPR) index Human serum (Sigma #H4522)":{"pvalue":2.7781332789800217e-10,"r":0.5673665177300351},"ZETA POTENTIAL Change":{"pvalue":2.071052722262855e-09,"r":-0.5435850485069125},"ZETA POTENTIAL [mV]":{"pvalue":2.025461243171378e-08,"r":0.5141366184476799},"Intensity Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":7.296459421546331e-06,"r":0.4220954992507819},"Volume Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.00014304093718453537,"r":0.3627394229575142},"Z-Average Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.0013415551974216289,"r":0.3089691292079323},"Localized Surface Plasmon Resonance (LSPR) index Change":{"pvalue":0.0020134808685383643,"r":0.2980672974262308},"Total surface area (SAtot) Human serum (Sigma #H4522) [cm^2]":{"pvalue":0.009242146870015556,"r":-0.2529069909205578},"Localized Surface Plasmon Resonance (LSPR) index":{"pvalue":0.010019903744117542,"r":0.2502907155942361},"Volume Mean Hydrodynamic Diameter Change":{"pvalue":0.013109585368169974,"r":0.24139711318406543},"Polydispersity index Change":{"pvalue":0.020454612521909568,"r":-0.22597482832068308},"Volume Mean Hydrodynamic Diameter [nm]":{"pvalue":0.033722917694600785,"r":0.20744306643756433},"Polydispersity index [nm]":{"pvalue":0.04433575298419301,"r":0.19667713971723438},"Number Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.04474107346052025,"r":0.19631065152636545}} diff --git a/views/predict.haml b/views/predict.haml index 6abd5bc..4584cbf 100644 --- a/views/predict.haml +++ b/views/predict.haml @@ -1,6 +1,6 @@ %div.well %form{:role=>"form", :action=> to("/predict"), :method=>"post"} - %span.help-block + %h3.help-block Please characterise a Gold nanoparticle #input - size = @example.size diff --git a/views/prediction.haml b/views/prediction.haml index 99bd2a0..113e934 100644 --- a/views/prediction.haml +++ b/views/prediction.haml @@ -22,94 +22,105 @@ %h5= key %input.form-control{:id=>id,:type=>"text",:value=>"#{val}", :disabled=>"true"} #match.tab-pane.fade - - if @prediction[:match] - %table{:id=>"match"} + - if @prediction[:match] + %table{:id=>"match", :style=>"border-style: 1px solid black;margin-top:10px;"} %thead %tr - - @prediction[:match][:id].keys.each do |key| - %th= key.capitalize + %th + ID + %th + Composition + %th + Tox + %th + Physchem %tbody %tr + / ID + %td + %h5= @prediction[:match].keys[0] / composition %td - - @prediction[:match][:id]["composition"].each do |k,v| + - @prediction[:match].values[0]["composition"].each do |k,v| %h5= k %p= v / tox %td - - @prediction[:match][:id]["tox"].each do |k,v| + - @prediction[:match].values[0]["tox"].each do |k,v| %h5= k %p= v / physchem %td - - @prediction[:match][:id]["physchem"].each do |k,v| + - @prediction[:match].values[0]["physchem"].each do |k,v| %h5= k %p= v - else %h5 No match #prediction.tab-pane.in.active - - @prediction[:prediction].each do |k,v| - %h5= k - %p= v.round(5) + - if @prediction[:prediction] + - @prediction[:prediction].each do |k,v| + %h5= k + %p= v.round(5) #neighbors.tab-pane.fade - :javascript - $(document).ready(function(){ - $("table#match").tablesorter({ - debug: false, - theme: "bootstrap", - headerTemplate: '{content} {icon}', - widgets: ['uitheme'], - headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}}, - sortList: [[1,1]], - widthFixed: false + - if @prediction[:neighbors] + :javascript + $(document).ready(function(){ + $("table#match").tablesorter({ + debug: false, + theme: "bootstrap", + headerTemplate: '{content} {icon}', + widgets: ['uitheme'], + headers: {0: {sorter: false}, 1: {sorter: false}, 2: {sorter: false}, 3: {sorter: false}}, + sortList: [[1,1]], + widthFixed: false + }); }); - }); - $(document).ready(function(){ - $("table#neighbors").tablesorter({ - debug: false, - theme: "bootstrap", - headerTemplate: '{content} {icon}', - widgets: ['uitheme'], - sortList: [[1,1]], - widthFixed: false + $(document).ready(function(){ + $("table#neighbors").tablesorter({ + debug: false, + theme: "bootstrap", + headerTemplate: '{content} {icon}', + widgets: ['uitheme'], + sortList: [[1,1]], + widthFixed: false + }); }); - }); - %div.table-responsive - %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"} - %thead - %tr - %th{:style =>"vertical-align:middle;"} - ID - %th{:style =>"vertical-align:middle;"} - Similarity - %th{:style =>"vertical-align:middle;"} - Composition - %th{:style =>"vertical-align:middle;"} - Tox - %th{:style =>"vertical-align:middle;"} - Physchem - %tbody - - @prediction[:neighbors].each do |neighbor| - %tr - / ID - %td - %h5= neighbor["id"] - / Similarity - %td - %h5= neighbor["similarity"].round(3) - / Composition - %td - - neighbor["composition"].each do |k,v| - %h5= k - %p= v - / Tox - %td - - neighbor["tox"].each do |k,v| - %h5= k - %p= v.round(3) - / Physchem - %td - - neighbor["physchem"].each do |k,v| - %h5= k - %p= v + %div.table-responsive + %table.tablesorter{:id=>"neighbors", :style=>"border-style: 1px solid black;margin-top:10px;"} + %thead + %tr + %th{:style =>"vertical-align:middle;"} + ID + %th{:style =>"vertical-align:middle;"} + Similarity + %th{:style =>"vertical-align:middle;"} + Composition + %th{:style =>"vertical-align:middle;"} + Tox + %th{:style =>"vertical-align:middle;"} + Physchem + %tbody + - @prediction[:neighbors].each do |neighbor| + %tr + / ID + %td + %h5= neighbor["id"] + / Similarity + %td + %h5= neighbor["similarity"].round(3) + / Composition + %td + - neighbor["composition"].each do |k,v| + %h5= k + %p= v + / Tox + %td + - neighbor["tox"].each do |k,v| + %h5= k + %p= v.round(3) + / Physchem + %td + - neighbor["physchem"].each do |k,v| + %h5= k + %p= v |