summary refs log tree commit diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-01-21 19:26:48 +0100
committer Christoph Helma <helma@in-silico.ch> 2016-01-21 19:26:48 +0100
commit d8f1e75ba45cb770f421fa950861c6ff502d64dd (patch)
tree d489ad054770f4b1528fdb60f9a13a9e7de3a3bd
parent 9546c589f6852942ed85f8da1e12c351fb92e0f0 (diff)
feature selection added
-rw-r--r-- feature-filter.rb 36
-rw-r--r-- nanoparticles.rb 43
-rw-r--r-- relevant-features.json 1
-rw-r--r-- test/predict.rb 9
4 files changed, 79 insertions, 10 deletions
diff --git a/feature-filter.rb b/feature-filter.rb
new file mode 100644
index 0000000..3765842
--- /dev/null
+++ b/feature-filter.rb
@@ -0,0 +1,36 @@
+require 'rserve'
+require 'json'
+require 'yaml'
+require 'csv'
+
# Rserve connection through which feature_filter assigns data to R and
# evaluates R expressions.
R = Rserve::Connection.new
# Key looked up in each entry's "tox" hash of data.json; frozen so the shared
# constant string cannot be mutated by accident.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])".freeze
+
# Selects the physchem features that correlate significantly with ENDPOINT.
#
# Reads ./data.json, takes the candidate feature names from the "G15.AC"
# entry, and for every feature runs R's cor.test (Pearson) through the Rserve
# connection R, comparing the feature values of all entries with their
# ENDPOINT tox values. Features with a p-value <= 0.05 are kept.
#
# A feature whose test fails is reported on stderr and skipped, so stdout
# stays clean for the JSON this script prints.
#
# @return [Hash] feature name => {"pvalue" => ..., "r" => ...}, ordered by
#   ascending p-value
def feature_filter
  data = JSON.parse(File.read("./data.json"))
  features = data["G15.AC"]["physchem"].keys
  R.assign "tox", data.collect { |_id, cats| cats["tox"][ENDPOINT] }
  filtered_features = {}
  features.each do |feature|
    R.assign "feature", data.collect { |_id, cats| cats["physchem"][feature] }
    begin
      #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
      R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
      pvalue = R.eval("cor$p.value").to_ruby
      next unless pvalue <= 0.05
      filtered_features[feature] = {
        "pvalue" => pvalue,
        "r" => R.eval("cor$estimate").to_ruby
      }
    rescue => e
      # Diagnostics go to stderr; printing them on stdout would be mixed into
      # the JSON output of the script.
      warn "feature_filter: skipping #{feature.inspect}: #{e.class}: #{e.message}"
    end
  end
  filtered_features.sort_by { |_feature, stats| stats["pvalue"] }.to_h
end
+
# Emit the selected features as a single JSON document on stdout.
puts JSON.generate(feature_filter)
diff --git a/nanoparticles.rb b/nanoparticles.rb
index 890b3ca..e34e509 100644
--- a/nanoparticles.rb
+++ b/nanoparticles.rb
@@ -1,3 +1,4 @@
+require 'rserve'
require 'json'
require 'yaml'
require 'csv'
@@ -9,24 +10,32 @@ def predict params
sim_sum = 0
weighted_sum = 0
match = nil
+ relevant_features = JSON.parse(File.read("./relevant-features.json"))
+ weights = relevant_features.values.collect{|v| v["r"]}
JSON.parse(File.read("./data.json")).each do |id,categories|
- if params.values == categories["physchem"].values
- match = {:id => categories}
+ neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values
+ if params.values == neighbor_values
+ match = {id => categories}
else
- sim = cosine_similarity(params.values,categories["physchem"].values)
- neighbor = categories
- neighbor["similarity"] = sim
- neighbor["id"] = id
- sim_sum += sim
- weighted_sum += sim*Math.log(categories["tox"][ENDPOINT])
- neighbors << neighbor
+ sim = weighted_cosine_similarity(params.values,neighbor_values,weights)
+ if sim > 0.95
+ neighbor = categories
+ neighbor["similarity"] = sim
+ neighbor["sim"] = cosine_similarity(params.values,neighbor_values)
+ neighbor["id"] = id
+ sim_sum += sim
+ weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT])
+ #weighted_sum += sim*categories["tox"][ENDPOINT]
+ neighbors << neighbor
+ end
end
end
neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
+ sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
{
:query => params,
:match => match,
- :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)},
+ :prediction => {ENDPOINT => prediction},
:neighbors => neighbors
}
end
@@ -52,10 +61,24 @@ def magnitude(point)
Math.sqrt(squares.inject(0) {|s, c| s + c})
end
+# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
# Cosine similarity of a and b: their dot product divided by the product of
# their magnitudes.
def cosine_similarity(a, b)
  numerator = dot_product(a, b)
  denominator = magnitude(a) * magnitude(b)
  numerator / denominator
end
# Cosine similarity of two vectors with per-dimension weights.
#
# Dimension i contributes with weight w[i].abs to the dot product and to both
# squared magnitudes. Only the first a.size entries of b and w are read.
#
# @param a [Array<Numeric>] first vector
# @param b [Array<Numeric>] second vector
# @param w [Array<Numeric>] weights; their sign is ignored
# @return [Float] weighted cosine similarity, or 0.0 when the product of the
#   weighted magnitudes is zero (zero vector, all-zero weights, empty input)
def weighted_cosine_similarity(a, b, w)
  dot = 0
  norm_a = 0
  norm_b = 0
  a.each_index do |i|
    weight = w[i].abs
    dot += weight * a[i] * b[i]
    norm_a += weight * a[i]**2
    norm_b += weight * b[i]**2
  end
  denominator = Math.sqrt(norm_a * norm_b)
  # Dividing by a zero denominator would produce NaN; report "no similarity".
  denominator.zero? ? 0.0 : dot / denominator
end
+
#@endpoint = @data.collect{|r| r[5]}
def neighbors query
diff --git a/relevant-features.json b/relevant-features.json
new file mode 100644
index 0000000..8da5f0a
--- /dev/null
+++ b/relevant-features.json
@@ -0,0 +1 @@
+{"Localized Surface Plasmon Resonance (LSPR) index Human serum (Sigma #H4522)":{"pvalue":2.7781332789800217e-10,"r":0.5673665177300351},"ZETA POTENTIAL Change":{"pvalue":2.071052722262855e-09,"r":-0.5435850485069125},"ZETA POTENTIAL [mV]":{"pvalue":2.025461243171378e-08,"r":0.5141366184476799},"Intensity Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":7.296459421546331e-06,"r":0.4220954992507819},"Volume Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.00014304093718453537,"r":0.3627394229575142},"Z-Average Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.0013415551974216289,"r":0.3089691292079323},"Localized Surface Plasmon Resonance (LSPR) index Change":{"pvalue":0.0020134808685383643,"r":0.2980672974262308},"Total surface area (SAtot) Human serum (Sigma #H4522) [cm^2]":{"pvalue":0.009242146870015556,"r":-0.2529069909205578},"Localized Surface Plasmon Resonance (LSPR) index":{"pvalue":0.010019903744117542,"r":0.2502907155942361},"Volume Mean Hydrodynamic Diameter Change":{"pvalue":0.013109585368169974,"r":0.24139711318406543},"Polydispersity index Change":{"pvalue":0.020454612521909568,"r":-0.22597482832068308},"Volume Mean Hydrodynamic Diameter [nm]":{"pvalue":0.033722917694600785,"r":0.20744306643756433},"Polydispersity index [nm]":{"pvalue":0.04433575298419301,"r":0.19667713971723438},"Number Mean Hydrodynamic Diameter Human serum (Sigma #H4522) [nm]":{"pvalue":0.04474107346052025,"r":0.19631065152636545}}
diff --git a/test/predict.rb b/test/predict.rb
new file mode 100644
index 0000000..9aed5e2
--- /dev/null
+++ b/test/predict.rb
@@ -0,0 +1,9 @@
+require_relative "../nanoparticles.rb"
data = JSON.parse(File.read("./data.json"))
relevant_features = JSON.parse(File.read("./relevant-features.json"))
# Query: the physchem values of one randomly sampled entry, restricted to the
# features listed in relevant-features.json.
example = data[data.keys.sample]["physchem"].select { |feature, _value| relevant_features.key?(feature) }
# Predict once and reuse the result; each predict call re-reads both JSON
# files and rescans the whole data set.
result = predict(example)
puts result[:match].collect { |_id, v| v["tox"] }.first
puts result[:prediction]
#puts result[:neighbors].size
+