From 9546c589f6852942ed85f8da1e12c351fb92e0f0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 20 Jan 2016 13:53:22 +0100 Subject: enm import removed --- .gitignore | 1 + application.rb | 4 +- import.rb | 153 ------------------------------------------------------ nanoparticles.rb | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++ protein_corona.rb | 147 --------------------------------------------------- 5 files changed, 150 insertions(+), 302 deletions(-) delete mode 100644 import.rb create mode 100644 nanoparticles.rb delete mode 100644 protein_corona.rb diff --git a/.gitignore b/.gitignore index 43ce16a..e73632d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ data papers *tmp *swp +enm-import.rb diff --git a/application.rb b/application.rb index 1cb0e13..703932f 100644 --- a/application.rb +++ b/application.rb @@ -1,7 +1,7 @@ require 'sinatra' require "sinatra/reloader" if development? -require_relative 'protein_corona.rb' -also_reload './protein_corona.rb' +require_relative 'nanoparticles.rb' +also_reload './nanoparticles.rb' get '/?' do @data = JSON.parse(File.read("./data.json")) diff --git a/import.rb b/import.rb deleted file mode 100644 index 63d8a08..0000000 --- a/import.rb +++ /dev/null @@ -1,153 +0,0 @@ -# TODO: missing data for protein corona silver particles -require 'json' -require 'yaml' -require 'csv' -require_relative "lib/nano-lazar.rb" -include OpenTox - -def feature_name uri - f = @features[uri] - name = f['title'] - annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", " - name << " (#{annotations})" unless annotations.empty? - name << " [#{f['units']}]" if f['units'] and !f['units'].empty? - name -end - -nanomaterials = [] -feature_names = {} -@features = {} - -["nanowiki.json", "protein-corona.json", "marina.json"].each do |f| - bundle = JSON.parse(File.read(File.join("data",f))) - @features.merge! 
bundle["feature"] - bundle["dataEntry"].each do |substance| - nm = Nanoparticle.new - nm.uri = substance["compound"]["URI"] - nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"] - if substance["composition"] - nr_cores = substance["composition"].select{|c| c["relation"] == "HAS_CORE"}.size - puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores !=1 - substance["composition"].each do |composition| - component = composition["component"] - if component - name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - #names << name - if composition["relation"] == "HAS_CORE" - nm.core = name - elsif composition["relation"] == "HAS_COATING" - nm.coating ||= [] - nm.coating << name - end - else - #puts substance.to_yaml - end - end - else - #puts substance.to_yaml - end - substance["values"].each do |k,v| - property = nil - if k.match(/TOX/) - nm.tox ||= [] - property = "tox" - elsif k.match(/P-CHEM/) - nm.p_chem ||= [] - property = "p_chem" - end - if property - v.each do |val| - if val.keys == ["loValue"] - nm.tox << {k => val["loValue"]} if property == "tox" - nm.p_chem << {k => val["loValue"]} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == "mean" - nm.tox << {k => val["loValue"]} if property == "tox" - nm.p_chem << {k => val["loValue"]} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue", "upQualifier", "upValue" ] - nm.tox << {k => (val["loValue"]+val["upValue"])/2} if property == "tox" - nm.p_chem << {k => (val["loValue"]+val["upValue"])/2} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == ">=" - else - p val - end - end - else - #p k,v - end - end - nm.tox.uniq! if nm.tox - nm.p_chem.uniq! if nm.p_chem - nanomaterials << nm - end -end - -puts "Total imported: #{nanomaterials.size}" -puts "With nanoparticle characterisation: #{nanomaterials.select{|n| n.p_chem}.size}" -modelling_data = nanomaterials.select{|n| n.tox and n.p_chem} -puts "With TOX data: #{nanomaterials.select{|n| n.tox}.size}" -puts "With TOX data and particle characterisation: #{modelling_data.size}" -endpoints = modelling_data.collect{|n| n.tox.collect{|t| t.keys}}.flatten.compact.uniq -puts -puts "Endpoints: #{endpoints.size}" - -single_value_endpoints = [] -endpoint_values = {} - -endpoints.each do |e| - i = 0 - values = [] - modelling_data.each do |n| - n.tox.each do |t| - if t[e] - i += 1 - values << t[e] - end - end - end - single_value_endpoints << e if values.uniq.size == 1 - endpoint_values[e] = values.size unless values.uniq.size == 1 -end - -endpoints -= single_value_endpoints -puts "Endpoints with more than one measurement value: #{endpoints.size}" -endpoint_values.select!{|k,v| v > 10} -puts "Endpoints with more than 10 measurements: #{endpoint_values.size}" -endpoints = endpoint_values.keys -puts -puts endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect{|e,v| "#{feature_names[e]}: #{v}"}.join("\n") - -endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a" -nanomaterials.select!{|nm| nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? 
endpoint} -p nanomaterials.size - -feature_values = {} -nanomaterials.each do |nm| - (nm.p_chem + nm.tox).each do |f| - feature_names[f] = feature_name f # avoid appending annotations/units with each function call, unclear why it happens - p f unless f.size == 1 - k = f.keys.first - unless f[k].is_a? String - feature_values[k] ||= [] - feature_values[k] << f[k] - end - end -end - -# remove empty values -feature_values.select!{|f,vals| vals.uniq.size > 2} -tox_descriptors = feature_values.select{|f,vals| f.match 'TOX'}.keys -p_chem_descriptors = feature_values.select{|f,vals| f.match 'P-CHEM'}.keys - -#puts @features.to_yaml - -column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]} -table = [] -CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv| - csv << column_names - nanomaterials.each do |nm| - if nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint - #table << [] - csv << [nm.name] + p_chem_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} + tox_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} - end - end -end diff --git a/nanoparticles.rb b/nanoparticles.rb new file mode 100644 index 0000000..890b3ca --- /dev/null +++ b/nanoparticles.rb @@ -0,0 +1,147 @@ +require 'json' +require 'yaml' +require 'csv' + +ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" + +def predict params + neighbors = [] + sim_sum = 0 + weighted_sum = 0 + match = nil + JSON.parse(File.read("./data.json")).each do |id,categories| + if params.values == categories["physchem"].values + match = {:id => categories} + else + sim = cosine_similarity(params.values,categories["physchem"].values) + neighbor = categories + neighbor["similarity"] = sim + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) + neighbors << neighbor + end + end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + { + :query => params, + :match => match, + :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, + :neighbors => neighbors + } +end + +class Object + def numeric? + true if Float(self) rescue false + end +end + +def euclidean_distance(a, b) + sq = a.zip(b).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) +end + +def dot_product(a, b) + products = a.zip(b).map{|a, b| a * b} + products.inject(0) {|s,p| s + p} +end + +def magnitude(point) + squares = point.map{|x| x ** 2} + Math.sqrt(squares.inject(0) {|s, c| s + c}) +end + +def cosine_similarity(a, b) + dot_product(a, b) / (magnitude(a) * magnitude(b)) +end + +#@endpoint = @data.collect{|r| r[5]} + +def neighbors query +end + +def csv2json + csv = CSV.read("data/MergedSheets_edit.csv") + csv.collect!{|row| row[0..36].collect{|c| c.numeric? ? 
c.to_f : c } }.compact + feature_names = [ + "ID", + csv[0][1], + csv[0][2], + csv[0][3], + csv[6][4], + "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint + "#{csv[0][6]} (#{csv[6][6]})", # endpoint + "#{csv[6][7]} [#{csv[11][7]}]", + "#{csv[6][8]} [#{csv[11][8]}]", + "#{csv[6][9]} [#{csv[11][9]}]", + ] + (10..10+5*3).step(3) do |i| + feature_names += [ + "#{csv[6][i]} [#{csv[11][i]}]", + "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]", + "#{csv[6][i+2]} #{csv[8][i+2]}", + ] + end + feature_names += [ + "#{csv[6][28]}", + "#{csv[6][29]} #{csv[8][29]}", + "#{csv[6][30]} #{csv[8][30]}", + ] + (31..34).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]" + end + (35..36).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]" + end + data = {} + csv.drop(12).each do |row| + id = row.first + if id.match /^G/ # skip Ag, too many missing values + data[id] = {} + row.each_with_index do |col,i| + if i == 0 + data[id][:composition] = {} + elsif i < 5 + data[id][:composition][feature_names[i]] = col + elsif i == 5 + data[id][:tox] ||= {} + data[id][:tox][feature_names[i]] = col + elsif i > 6 + data[id][:physchem] ||= {} + data[id][:physchem][feature_names[i]] = col + end + end + end + end + File.open("data.json","w+"){|f| f.puts data.to_json} + data +end + +#puts data.to_yaml +=begin +R.assign "endpoint", endpoint +(0..data[0].size).each do |c| + if data.collect{|r| r[c]}.uniq.size > 1 + begin + R.assign "feature", data.collect{|r| r[c]} + R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')" + r = R.eval("r").to_ruby + p "#{c}: #{r}" if r > 0.3 or r < -0.3 + rescue + end + end +end + + +csv[0..13].each do |row| + row.each_with_index do |col,i| + features[i] = features[i].to_s+", "+col.to_s + end +end + +puts features.select{|f| f.match(/Mean/)}.to_yaml + + #n+=1 + #p n,row.first unless row.first.match /^[G|S]/ +=end diff --git a/protein_corona.rb b/protein_corona.rb deleted file mode 100644 index 890b3ca..0000000 --- a/protein_corona.rb +++ /dev/null @@ -1,147 +0,0 @@ -require 'json' -require 'yaml' -require 'csv' - -ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" - -def predict params - neighbors = [] - sim_sum = 0 - weighted_sum = 0 - match = nil - JSON.parse(File.read("./data.json")).each do |id,categories| - if params.values == categories["physchem"].values - match = {:id => categories} - else - sim = cosine_similarity(params.values,categories["physchem"].values) - neighbor = categories - neighbor["similarity"] = sim - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) - neighbors << neighbor - end - end - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - { - :query => params, - :match => match, - :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, - :neighbors => neighbors - } -end - -class Object - def numeric? 
- true if Float(self) rescue false - end -end - -def euclidean_distance(a, b) - sq = a.zip(b).map{|a,b| (a - b) ** 2} - Math.sqrt(sq.inject(0) {|s,c| s + c}) -end - -def dot_product(a, b) - products = a.zip(b).map{|a, b| a * b} - products.inject(0) {|s,p| s + p} -end - -def magnitude(point) - squares = point.map{|x| x ** 2} - Math.sqrt(squares.inject(0) {|s, c| s + c}) -end - -def cosine_similarity(a, b) - dot_product(a, b) / (magnitude(a) * magnitude(b)) -end - -#@endpoint = @data.collect{|r| r[5]} - -def neighbors query -end - -def csv2json - csv = CSV.read("data/MergedSheets_edit.csv") - csv.collect!{|row| row[0..36].collect{|c| c.numeric? ? c.to_f : c } }.compact - feature_names = [ - "ID", - csv[0][1], - csv[0][2], - csv[0][3], - csv[6][4], - "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint - "#{csv[0][6]} (#{csv[6][6]})", # endpoint - "#{csv[6][7]} [#{csv[11][7]}]", - "#{csv[6][8]} [#{csv[11][8]}]", - "#{csv[6][9]} [#{csv[11][9]}]", - ] - (10..10+5*3).step(3) do |i| - feature_names += [ - "#{csv[6][i]} [#{csv[11][i]}]", - "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]", - "#{csv[6][i+2]} #{csv[8][i+2]}", - ] - end - feature_names += [ - "#{csv[6][28]}", - "#{csv[6][29]} #{csv[8][29]}", - "#{csv[6][30]} #{csv[8][30]}", - ] - (31..34).each do |i| - feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]" - end - (35..36).each do |i| - feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]" - end - data = {} - csv.drop(12).each do |row| - id = row.first - if id.match /^G/ # skip Ag, too many missing values - data[id] = {} - row.each_with_index do |col,i| - if i == 0 - data[id][:composition] = {} - elsif i < 5 - data[id][:composition][feature_names[i]] = col - elsif i == 5 - data[id][:tox] ||= {} - data[id][:tox][feature_names[i]] = col - elsif i > 6 - data[id][:physchem] ||= {} - data[id][:physchem][feature_names[i]] = col - end - end - end - end - File.open("data.json","w+"){|f| f.puts data.to_json} - data -end - -#puts data.to_yaml -=begin -R.assign "endpoint", endpoint -(0..data[0].size).each do |c| - if data.collect{|r| r[c]}.uniq.size > 1 - begin - R.assign "feature", data.collect{|r| r[c]} - R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')" - r = R.eval("r").to_ruby - p "#{c}: #{r}" if r > 0.3 or r < -0.3 - rescue - end - end -end - - -csv[0..13].each do |row| - row.each_with_index do |col,i| - features[i] = features[i].to_s+", "+col.to_s - end -end - -puts features.select{|f| f.match(/Mean/)}.to_yaml - - #n+=1 - #p n,row.first unless row.first.match /^[G|S]/ -=end -- cgit v1.2.3
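
The predict method added in nanoparticles.rb implements a cosine-similarity-weighted nearest-neighbour estimate over the physchem descriptors read from data.json. Below is a minimal, self-contained sketch of that scheme for illustration only: the descriptor vectors, endpoint values and variable names are made up, and only the helper functions mirror the committed code. The sketch keeps the log transform and the back-transform in the same base (log10 with 10**); the committed predict pairs Math.log (natural log) with 10**, so the base should be adjusted to whichever behaviour is intended.

    # Sketch of the similarity-weighted prediction used in nanoparticles.rb#predict.
    # Hypothetical data; the real code reads ./data.json written by csv2json.

    def dot_product(a, b)
      a.zip(b).map { |x, y| x * y }.inject(0) { |s, p| s + p }
    end

    def magnitude(v)
      Math.sqrt(v.map { |x| x**2 }.inject(0) { |s, x| s + x })
    end

    def cosine_similarity(a, b)
      dot_product(a, b) / (magnitude(a) * magnitude(b))
    end

    # Hypothetical neighbours: physchem descriptor vectors with a measured endpoint value.
    neighbors = [
      { "physchem" => [10.0, 2.5, 0.1], "tox" => 120.0 },
      { "physchem" => [12.0, 2.0, 0.2], "tox" => 95.0 },
      { "physchem" => [50.0, 9.0, 1.5], "tox" => 4.0 }
    ]

    query = [11.0, 2.4, 0.15] # physchem descriptors of the query particle

    # Weight each neighbour's log-transformed endpoint by its cosine similarity
    # to the query, then back-transform the weighted mean.
    sim_sum = 0.0
    weighted_sum = 0.0
    neighbors.each do |n|
      sim = cosine_similarity(query, n["physchem"])
      sim_sum += sim
      weighted_sum += sim * Math.log10(n["tox"])
    end

    prediction = 10**(weighted_sum / sim_sum)
    puts "Predicted endpoint: #{prediction.round(2)}"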