From 9546c589f6852942ed85f8da1e12c351fb92e0f0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 20 Jan 2016 13:53:22 +0100 Subject: enm import removed --- nanoparticles.rb | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 nanoparticles.rb (limited to 'nanoparticles.rb') diff --git a/nanoparticles.rb b/nanoparticles.rb new file mode 100644 index 0000000..890b3ca --- /dev/null +++ b/nanoparticles.rb @@ -0,0 +1,147 @@ +require 'json' +require 'yaml' +require 'csv' + +ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" + +def predict params + neighbors = [] + sim_sum = 0 + weighted_sum = 0 + match = nil + JSON.parse(File.read("./data.json")).each do |id,categories| + if params.values == categories["physchem"].values + match = {:id => categories} + else + sim = cosine_similarity(params.values,categories["physchem"].values) + neighbor = categories + neighbor["similarity"] = sim + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) + neighbors << neighbor + end + end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + { + :query => params, + :match => match, + :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, + :neighbors => neighbors + } +end + +class Object + def numeric? + true if Float(self) rescue false + end +end + +def euclidean_distance(a, b) + sq = a.zip(b).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) +end + +def dot_product(a, b) + products = a.zip(b).map{|a, b| a * b} + products.inject(0) {|s,p| s + p} +end + +def magnitude(point) + squares = point.map{|x| x ** 2} + Math.sqrt(squares.inject(0) {|s, c| s + c}) +end + +def cosine_similarity(a, b) + dot_product(a, b) / (magnitude(a) * magnitude(b)) +end + +#@endpoint = @data.collect{|r| r[5]} + +def neighbors query +end + +def csv2json + csv = CSV.read("data/MergedSheets_edit.csv") + csv.collect!{|row| row[0..36].collect{|c| c.numeric? ? c.to_f : c } }.compact + feature_names = [ + "ID", + csv[0][1], + csv[0][2], + csv[0][3], + csv[6][4], + "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint + "#{csv[0][6]} (#{csv[6][6]})", # endpoint + "#{csv[6][7]} [#{csv[11][7]}]", + "#{csv[6][8]} [#{csv[11][8]}]", + "#{csv[6][9]} [#{csv[11][9]}]", + ] + (10..10+5*3).step(3) do |i| + feature_names += [ + "#{csv[6][i]} [#{csv[11][i]}]", + "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]", + "#{csv[6][i+2]} #{csv[8][i+2]}", + ] + end + feature_names += [ + "#{csv[6][28]}", + "#{csv[6][29]} #{csv[8][29]}", + "#{csv[6][30]} #{csv[8][30]}", + ] + (31..34).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]" + end + (35..36).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]" + end + data = {} + csv.drop(12).each do |row| + id = row.first + if id.match /^G/ # skip Ag, too many missing values + data[id] = {} + row.each_with_index do |col,i| + if i == 0 + data[id][:composition] = {} + elsif i < 5 + data[id][:composition][feature_names[i]] = col + elsif i == 5 + data[id][:tox] ||= {} + data[id][:tox][feature_names[i]] = col + elsif i > 6 + data[id][:physchem] ||= {} + data[id][:physchem][feature_names[i]] = col + end + end + end + end + File.open("data.json","w+"){|f| f.puts data.to_json} + data +end + +#puts data.to_yaml +=begin +R.assign "endpoint", endpoint +(0..data[0].size).each do |c| + if data.collect{|r| r[c]}.uniq.size > 1 + begin + R.assign "feature", data.collect{|r| r[c]} + R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')" + r = R.eval("r").to_ruby + p "#{c}: #{r}" if r > 0.3 or r < -0.3 + rescue + end + end +end + + +csv[0..13].each do |row| + row.each_with_index do |col,i| + features[i] = features[i].to_s+", "+col.to_s + end +end + +puts features.select{|f| f.match(/Mean/)}.to_yaml + + #n+=1 + #p n,row.first unless row.first.match /^[G|S]/ +=end -- cgit v1.2.3 From d8f1e75ba45cb770f421fa950861c6ff502d64dd Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Jan 2016 19:26:48 +0100 Subject: feature selection added --- nanoparticles.rb | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'nanoparticles.rb') diff --git a/nanoparticles.rb b/nanoparticles.rb index 890b3ca..e34e509 100644 --- a/nanoparticles.rb +++ b/nanoparticles.rb @@ -1,3 +1,4 @@ +require 'rserve' require 'json' require 'yaml' require 'csv' @@ -9,24 +10,32 @@ def predict params sim_sum = 0 weighted_sum = 0 match = nil + relevant_features = JSON.parse(File.read("./relevant-features.json")) + weights = relevant_features.values.collect{|v| v["r"]} JSON.parse(File.read("./data.json")).each do |id,categories| - if params.values == categories["physchem"].values - match = {:id => categories} + neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values + if params.values == neighbor_values + match = {id => categories} else - sim = cosine_similarity(params.values,categories["physchem"].values) - neighbor = categories - neighbor["similarity"] = sim - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) - neighbors << neighbor + sim = weighted_cosine_similarity(params.values,neighbor_values,weights) + if sim > 0.95 + neighbor = categories + neighbor["similarity"] = sim + neighbor["sim"] = cosine_similarity(params.values,neighbor_values) + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT]) + #weighted_sum += sim*categories["tox"][ENDPOINT] + neighbors << neighbor + end end end neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) { :query => params, :match => match, - :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, + :prediction => {ENDPOINT => prediction}, :neighbors => neighbors } end @@ -52,10 +61,24 @@ def magnitude(point) Math.sqrt(squares.inject(0) {|s, c| s + c}) end +# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity def cosine_similarity(a, b) dot_product(a, b) / (magnitude(a) * magnitude(b)) end +def weighted_cosine_similarity(a, b, w) + dot_product = 0 + magnitude_a = 0 + magnitude_b = 0 + (0..a.size-1).each do |i| + dot_product += w[i].abs*a[i]*b[i] + magnitude_a += w[i].abs*a[i]**2 + magnitude_b += w[i].abs*b[i]**2 + end + dot_product/Math.sqrt(magnitude_a*magnitude_b) + +end + #@endpoint = @data.collect{|r| r[5]} def neighbors query -- cgit v1.2.3 From 7e824ae1a52ee27bda90dd08783aef0ab3a539a9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 29 Jan 2016 12:23:14 +0100 Subject: internal calculation of difference features --- nanoparticles.rb | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'nanoparticles.rb') diff --git a/nanoparticles.rb b/nanoparticles.rb index e34e509..d3399e9 100644 --- a/nanoparticles.rb +++ b/nanoparticles.rb @@ -5,7 +5,16 @@ require 'csv' ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" +def query_features + relevant_features = JSON.parse(File.read("./relevant-features.json")) +end + def predict params + # calculate difference parameters + diff_features = JSON.parse(File.read("./diff-features.json")) + diff_features.each do |feature,originals| + params[feature] = params[originals[1]]-params[originals[0]] # causes rounding errors! + end neighbors = [] sim_sum = 0 weighted_sum = 0 @@ -13,21 +22,24 @@ def predict params relevant_features = JSON.parse(File.read("./relevant-features.json")) weights = relevant_features.values.collect{|v| v["r"]} JSON.parse(File.read("./data.json")).each do |id,categories| - neighbor_values = categories["physchem"].select{|f,v| params.keys.include? f}.values - if params.values == neighbor_values + query_values = [] + neighbor_values = [] + relevant_features.keys.each do |f| + query_values << params[f] + neighbor_values << categories["physchem"][f] + end + sim = weighted_cosine_similarity(query_values,neighbor_values,weights) + if sim > 0.9999 # no exact match because of rounding errors match = {id => categories} - else - sim = weighted_cosine_similarity(params.values,neighbor_values,weights) - if sim > 0.95 - neighbor = categories - neighbor["similarity"] = sim - neighbor["sim"] = cosine_similarity(params.values,neighbor_values) - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT]) - #weighted_sum += sim*categories["tox"][ENDPOINT] - neighbors << neighbor - end + elsif sim > 0.95 + neighbor = categories + neighbor["similarity"] = sim + neighbor["sim"] = cosine_similarity(query_values,neighbor_values) + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log10(categories["tox"][ENDPOINT]) + #weighted_sum += sim*categories["tox"][ENDPOINT] + neighbors << neighbor end end neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} -- cgit v1.2.3