# feature-filter.rb
#
# Selects the physchem features in ./data.json that correlate significantly
# with the ENDPOINT measurement and prints them to stdout as a JSON object,
# ordered by ascending p-value. The correlation test itself is run in R
# through an Rserve connection.
require 'rserve'
require 'json'
require 'yaml'
require 'csv'

# Rserve connection shared by all correlation tests in this script.
R = Rserve::Connection.new
# Key of the measurement read from each particle's "tox" hash.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])"

# Runs a Pearson correlation test (R's cor.test) between the ENDPOINT values
# and every physchem feature, keeping the features whose p-value does not
# exceed +max_pvalue+.
#
# @param max_pvalue [Float] significance cut-off (inclusive); defaults to the
#   previously hard-coded 0.05
# @return [Hash{String => Hash}] feature name => {"pvalue" => ..., "r" => ...},
#   ordered by ascending p-value
def feature_filter(max_pvalue: 0.05)
  data = JSON.parse(File.read("./data.json"))
  # The feature list is taken from the "G15.AC" entry only.
  # NOTE(review): assumes every entry carries the same physchem keys - confirm.
  features = data["G15.AC"]["physchem"].keys
  R.assign "tox", data.collect { |_id, cats| cats["tox"][ENDPOINT] }

  filtered_features = {}
  features.each do |feature|
    R.assign "feature", data.collect { |_id, cats| cats["physchem"][feature] }
    begin
      # Alternative on log-transformed values (kept from the original for reference):
      #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
      R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
      pvalue = R.eval("cor$p.value").to_ruby
      next unless pvalue <= max_pvalue

      filtered_features[feature] = {
        "pvalue" => pvalue,
        "r" => R.eval("cor$estimate").to_ruby
      }
    rescue => e
      # Report on stderr and move on to the next feature. Diagnostics must not
      # go to stdout (it carries the JSON result), and the handler itself must
      # not raise, otherwise one bad feature aborts the whole run.
      warn "feature_filter: skipping #{feature.inspect} (#{e.class}: #{e.message})"
    end
  end

  filtered_features.sort_by { |_feature, stats| stats["pvalue"] }.to_h
end

puts feature_filter.to_json
-9,24 +10,32 @@ def predict params sim_sum = 0 weighted_sum = 0 match = nil + relevant_features = JSON.parse(File.read("./relevant-features.json")) + weights = relevant_features.values.collect{|v| v["r"]} JSON.parse(File.read("./data.json")).each do |id,categories| - if params.values == categories["physchem"].values - match = {:id => categories} + neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values + if params.values == neighbor_values + match = {id => categories} else - sim = cosine_similarity(params.values,categories["physchem"].values) - neighbor = categories - neighbor["similarity"] = sim - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) - neighbors << neighbor + sim = weighted_cosine_similarity(params.values,neighbor_values,weights) + if sim > 0.95 + neighbor = categories + neighbor["similarity"] = sim + neighbor["sim"] = cosine_similarity(params.values,neighbor_values) + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT]) + #weighted_sum += sim*categories["tox"][ENDPOINT] + neighbors << neighbor + end end end neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + sim_sum == 0 ? 
# Cosine similarity of two numeric vectors.
# Relies on the dot_product and magnitude helpers defined elsewhere in this file.
# NOTE(review): no guard for zero-magnitude vectors here; the result then
# depends on what the helpers return - confirm.
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
def cosine_similarity(a, b)
  dot_product(a, b) / (magnitude(a) * magnitude(b))
end

# Cosine similarity in which every dimension is scaled by the absolute value
# of its weight, so the sign of a weight is ignored.
#
# Only the first a.size elements of +b+ and +w+ are used.
#
# @param a [Array<Numeric>] first vector
# @param b [Array<Numeric>] second vector, at least as long as +a+
# @param w [Array<Numeric>] per-dimension weights, at least as long as +a+
# @return [Float] weighted cosine similarity; 0.0 when it is undefined
#   (empty input or a vector whose weighted magnitude is zero) instead of NaN
# @raise [ArgumentError] if +b+ or +w+ has fewer elements than +a+
def weighted_cosine_similarity(a, b, w)
  if b.size < a.size || w.size < a.size
    raise ArgumentError,
          "b (#{b.size}) and w (#{w.size}) need at least as many elements as a (#{a.size})"
  end

  dot = 0.0
  norm_a = 0.0
  norm_b = 0.0
  # zip stops at a.size, matching the original index loop over a.
  a.zip(b, w) do |x, y, weight|
    scale = weight.abs
    dot += scale * x * y
    norm_a += scale * x**2
    norm_b += scale * y**2
  end

  denominator = Math.sqrt(norm_a * norm_b)
  # 0/0 would yield NaN; report "no similarity" instead.
  return 0.0 if denominator.zero?

  dot / denominator
end
# test/predict.rb
#
# Smoke test for predict: picks a random entry from ./data.json, reduces its
# physchem values to the features listed in ./relevant-features.json, and
# prints the "tox" data of the exact match followed by the prediction.
require_relative "../nanoparticles.rb"

data = JSON.parse(File.read("./data.json"))
relevant_features = JSON.parse(File.read("./relevant-features.json"))
example = data[data.keys.sample]["physchem"].select { |feature, _value| relevant_features.key?(feature) }

# Call predict once and reuse the result; every call re-reads and re-parses
# both JSON files.
result = predict(example)
puts result[:match].collect { |_id, categories| categories["tox"] }.first
puts result[:prediction]
#puts result[:neighbors].size