From 9546c589f6852942ed85f8da1e12c351fb92e0f0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 20 Jan 2016 13:53:22 +0100 Subject: enm import removed --- .gitignore | 1 + application.rb | 4 +- import.rb | 153 ------------------------------------------------------ nanoparticles.rb | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++ protein_corona.rb | 147 --------------------------------------------------- 5 files changed, 150 insertions(+), 302 deletions(-) delete mode 100644 import.rb create mode 100644 nanoparticles.rb delete mode 100644 protein_corona.rb diff --git a/.gitignore b/.gitignore index 43ce16a..e73632d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ data papers *tmp *swp +enm-import.rb diff --git a/application.rb b/application.rb index 1cb0e13..703932f 100644 --- a/application.rb +++ b/application.rb @@ -1,7 +1,7 @@ require 'sinatra' require "sinatra/reloader" if development? -require_relative 'protein_corona.rb' -also_reload './protein_corona.rb' +require_relative 'nanoparticles.rb' +also_reload './nanoparticles.rb' get '/?' do @data = JSON.parse(File.read("./data.json")) diff --git a/import.rb b/import.rb deleted file mode 100644 index 63d8a08..0000000 --- a/import.rb +++ /dev/null @@ -1,153 +0,0 @@ -# TODO: missing data for protein corona silver particles -require 'json' -require 'yaml' -require 'csv' -require_relative "lib/nano-lazar.rb" -include OpenTox - -def feature_name uri - f = @features[uri] - name = f['title'] - annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", " - name << " (#{annotations})" unless annotations.empty? - name << " [#{f['units']}]" if f['units'] and !f['units'].empty? - name -end - -nanomaterials = [] -feature_names = {} -@features = {} - -["nanowiki.json", "protein-corona.json", "marina.json"].each do |f| - bundle = JSON.parse(File.read(File.join("data",f))) - @features.merge! 
bundle["feature"] - bundle["dataEntry"].each do |substance| - nm = Nanoparticle.new - nm.uri = substance["compound"]["URI"] - nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"] - if substance["composition"] - nr_cores = substance["composition"].select{|c| c["relation"] == "HAS_CORE"}.size - puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores !=1 - substance["composition"].each do |composition| - component = composition["component"] - if component - name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - #names << name - if composition["relation"] == "HAS_CORE" - nm.core = name - elsif composition["relation"] == "HAS_COATING" - nm.coating ||= [] - nm.coating << name - end - else - #puts substance.to_yaml - end - end - else - #puts substance.to_yaml - end - substance["values"].each do |k,v| - property = nil - if k.match(/TOX/) - nm.tox ||= [] - property = "tox" - elsif k.match(/P-CHEM/) - nm.p_chem ||= [] - property = "p_chem" - end - if property - v.each do |val| - if val.keys == ["loValue"] - nm.tox << {k => val["loValue"]} if property == "tox" - nm.p_chem << {k => val["loValue"]} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == "mean" - nm.tox << {k => val["loValue"]} if property == "tox" - nm.p_chem << {k => val["loValue"]} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue", "upQualifier", "upValue" ] - nm.tox << {k => (val["loValue"]+val["upValue"])/2} if property == "tox" - nm.p_chem << {k => (val["loValue"]+val["upValue"])/2} if property == "p_chem" - elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == ">=" - else - p val - end - end - else - #p k,v - end - end - nm.tox.uniq! if nm.tox - nm.p_chem.uniq! if nm.p_chem - nanomaterials << nm - end -end - -puts "Total imported: #{nanomaterials.size}" -puts "With nanoparticle characterisation: #{nanomaterials.select{|n| n.p_chem}.size}" -modelling_data = nanomaterials.select{|n| n.tox and n.p_chem} -puts "With TOX data: #{nanomaterials.select{|n| n.tox}.size}" -puts "With TOX data and particle characterisation: #{modelling_data.size}" -endpoints = modelling_data.collect{|n| n.tox.collect{|t| t.keys}}.flatten.compact.uniq -puts -puts "Endpoints: #{endpoints.size}" - -single_value_endpoints = [] -endpoint_values = {} - -endpoints.each do |e| - i = 0 - values = [] - modelling_data.each do |n| - n.tox.each do |t| - if t[e] - i += 1 - values << t[e] - end - end - end - single_value_endpoints << e if values.uniq.size == 1 - endpoint_values[e] = values.size unless values.uniq.size == 1 -end - -endpoints -= single_value_endpoints -puts "Endpoints with more than one measurement value: #{endpoints.size}" -endpoint_values.select!{|k,v| v > 10} -puts "Endpoints with more than 10 measurements: #{endpoint_values.size}" -endpoints = endpoint_values.keys -puts -puts endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect{|e,v| "#{feature_names[e]}: #{v}"}.join("\n") - -endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a" -nanomaterials.select!{|nm| nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? 
endpoint} -p nanomaterials.size - -feature_values = {} -nanomaterials.each do |nm| - (nm.p_chem + nm.tox).each do |f| - feature_names[f] = feature_name f # avoid appending annotations/units with each function call, unclear why it happens - p f unless f.size == 1 - k = f.keys.first - unless f[k].is_a? String - feature_values[k] ||= [] - feature_values[k] << f[k] - end - end -end - -# remove empty values -feature_values.select!{|f,vals| vals.uniq.size > 2} -tox_descriptors = feature_values.select{|f,vals| f.match 'TOX'}.keys -p_chem_descriptors = feature_values.select{|f,vals| f.match 'P-CHEM'}.keys - -#puts @features.to_yaml - -column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]} -table = [] -CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv| - csv << column_names - nanomaterials.each do |nm| - if nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint - #table << [] - csv << [nm.name] + p_chem_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} + tox_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} - end - end -end diff --git a/nanoparticles.rb b/nanoparticles.rb new file mode 100644 index 0000000..890b3ca --- /dev/null +++ b/nanoparticles.rb @@ -0,0 +1,147 @@ +require 'json' +require 'yaml' +require 'csv' + +ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" + +def predict params + neighbors = [] + sim_sum = 0 + weighted_sum = 0 + match = nil + JSON.parse(File.read("./data.json")).each do |id,categories| + if params.values == categories["physchem"].values + match = {:id => categories} + else + sim = cosine_similarity(params.values,categories["physchem"].values) + neighbor = categories + neighbor["similarity"] = sim + neighbor["id"] = id + sim_sum += sim + weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) + neighbors << neighbor + end + end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + { + :query => params, + :match => match, + :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, + :neighbors => neighbors + } +end + +class Object + def numeric? + true if Float(self) rescue false + end +end + +def euclidean_distance(a, b) + sq = a.zip(b).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) +end + +def dot_product(a, b) + products = a.zip(b).map{|a, b| a * b} + products.inject(0) {|s,p| s + p} +end + +def magnitude(point) + squares = point.map{|x| x ** 2} + Math.sqrt(squares.inject(0) {|s, c| s + c}) +end + +def cosine_similarity(a, b) + dot_product(a, b) / (magnitude(a) * magnitude(b)) +end + +#@endpoint = @data.collect{|r| r[5]} + +def neighbors query +end + +def csv2json + csv = CSV.read("data/MergedSheets_edit.csv") + csv.collect!{|row| row[0..36].collect{|c| c.numeric? ? 
c.to_f : c } }.compact + feature_names = [ + "ID", + csv[0][1], + csv[0][2], + csv[0][3], + csv[6][4], + "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint + "#{csv[0][6]} (#{csv[6][6]})", # endpoint + "#{csv[6][7]} [#{csv[11][7]}]", + "#{csv[6][8]} [#{csv[11][8]}]", + "#{csv[6][9]} [#{csv[11][9]}]", + ] + (10..10+5*3).step(3) do |i| + feature_names += [ + "#{csv[6][i]} [#{csv[11][i]}]", + "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]", + "#{csv[6][i+2]} #{csv[8][i+2]}", + ] + end + feature_names += [ + "#{csv[6][28]}", + "#{csv[6][29]} #{csv[8][29]}", + "#{csv[6][30]} #{csv[8][30]}", + ] + (31..34).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]" + end + (35..36).each do |i| + feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]" + end + data = {} + csv.drop(12).each do |row| + id = row.first + if id.match /^G/ # skip Ag, too many missing values + data[id] = {} + row.each_with_index do |col,i| + if i == 0 + data[id][:composition] = {} + elsif i < 5 + data[id][:composition][feature_names[i]] = col + elsif i == 5 + data[id][:tox] ||= {} + data[id][:tox][feature_names[i]] = col + elsif i > 6 + data[id][:physchem] ||= {} + data[id][:physchem][feature_names[i]] = col + end + end + end + end + File.open("data.json","w+"){|f| f.puts data.to_json} + data +end + +#puts data.to_yaml +=begin +R.assign "endpoint", endpoint +(0..data[0].size).each do |c| + if data.collect{|r| r[c]}.uniq.size > 1 + begin + R.assign "feature", data.collect{|r| r[c]} + R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')" + r = R.eval("r").to_ruby + p "#{c}: #{r}" if r > 0.3 or r < -0.3 + rescue + end + end +end + + +csv[0..13].each do |row| + row.each_with_index do |col,i| + features[i] = features[i].to_s+", "+col.to_s + end +end + +puts features.select{|f| f.match(/Mean/)}.to_yaml + + #n+=1 + #p n,row.first unless row.first.match /^[G|S]/ +=end diff --git a/protein_corona.rb b/protein_corona.rb deleted file mode 100644 index 890b3ca..0000000 --- a/protein_corona.rb +++ /dev/null @@ -1,147 +0,0 @@ -require 'json' -require 'yaml' -require 'csv' - -ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])" - -def predict params - neighbors = [] - sim_sum = 0 - weighted_sum = 0 - match = nil - JSON.parse(File.read("./data.json")).each do |id,categories| - if params.values == categories["physchem"].values - match = {:id => categories} - else - sim = cosine_similarity(params.values,categories["physchem"].values) - neighbor = categories - neighbor["similarity"] = sim - neighbor["id"] = id - sim_sum += sim - weighted_sum += sim*Math.log(categories["tox"][ENDPOINT]) - neighbors << neighbor - end - end - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - { - :query => params, - :match => match, - :prediction => {ENDPOINT => 10**(weighted_sum/sim_sum)}, - :neighbors => neighbors - } -end - -class Object - def numeric? 
- true if Float(self) rescue false - end -end - -def euclidean_distance(a, b) - sq = a.zip(b).map{|a,b| (a - b) ** 2} - Math.sqrt(sq.inject(0) {|s,c| s + c}) -end - -def dot_product(a, b) - products = a.zip(b).map{|a, b| a * b} - products.inject(0) {|s,p| s + p} -end - -def magnitude(point) - squares = point.map{|x| x ** 2} - Math.sqrt(squares.inject(0) {|s, c| s + c}) -end - -def cosine_similarity(a, b) - dot_product(a, b) / (magnitude(a) * magnitude(b)) -end - -#@endpoint = @data.collect{|r| r[5]} - -def neighbors query -end - -def csv2json - csv = CSV.read("data/MergedSheets_edit.csv") - csv.collect!{|row| row[0..36].collect{|c| c.numeric? ? c.to_f : c } }.compact - feature_names = [ - "ID", - csv[0][1], - csv[0][2], - csv[0][3], - csv[6][4], - "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint - "#{csv[0][6]} (#{csv[6][6]})", # endpoint - "#{csv[6][7]} [#{csv[11][7]}]", - "#{csv[6][8]} [#{csv[11][8]}]", - "#{csv[6][9]} [#{csv[11][9]}]", - ] - (10..10+5*3).step(3) do |i| - feature_names += [ - "#{csv[6][i]} [#{csv[11][i]}]", - "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]", - "#{csv[6][i+2]} #{csv[8][i+2]}", - ] - end - feature_names += [ - "#{csv[6][28]}", - "#{csv[6][29]} #{csv[8][29]}", - "#{csv[6][30]} #{csv[8][30]}", - ] - (31..34).each do |i| - feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]" - end - (35..36).each do |i| - feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]" - end - data = {} - csv.drop(12).each do |row| - id = row.first - if id.match /^G/ # skip Ag, too many missing values - data[id] = {} - row.each_with_index do |col,i| - if i == 0 - data[id][:composition] = {} - elsif i < 5 - data[id][:composition][feature_names[i]] = col - elsif i == 5 - data[id][:tox] ||= {} - data[id][:tox][feature_names[i]] = col - elsif i > 6 - data[id][:physchem] ||= {} - data[id][:physchem][feature_names[i]] = col - end - end - end - end - File.open("data.json","w+"){|f| f.puts data.to_json} - data -end - -#puts data.to_yaml -=begin -R.assign "endpoint", endpoint -(0..data[0].size).each do |c| - if data.collect{|r| r[c]}.uniq.size > 1 - begin - R.assign "feature", data.collect{|r| r[c]} - R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')" - r = R.eval("r").to_ruby - p "#{c}: #{r}" if r > 0.3 or r < -0.3 - rescue - end - end -end - - -csv[0..13].each do |row| - row.each_with_index do |col,i| - features[i] = features[i].to_s+", "+col.to_s - end -end - -puts features.select{|f| f.match(/Mean/)}.to_yaml - - #n+=1 - #p n,row.first unless row.first.match /^[G|S]/ -=end -- cgit v1.2.3
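
The predict method added in nanoparticles.rb implements a cosine-similarity-weighted nearest-neighbour estimate over the physchem descriptors read from data.json. Below is a minimal, self-contained sketch of that scheme for illustration only: the descriptor vectors, endpoint values and variable names are made up, and only the helper functions mirror the committed code. The sketch keeps the log transform and the back-transform in the same base (log10 with 10**); the committed predict pairs Math.log (natural log) with 10**, so the base should be adjusted to whichever behaviour is intended.

    # Sketch of the similarity-weighted prediction used in nanoparticles.rb#predict.
    # Hypothetical data; the real code reads ./data.json written by csv2json.

    def dot_product(a, b)
      a.zip(b).map { |x, y| x * y }.inject(0) { |s, p| s + p }
    end

    def magnitude(v)
      Math.sqrt(v.map { |x| x**2 }.inject(0) { |s, x| s + x })
    end

    def cosine_similarity(a, b)
      dot_product(a, b) / (magnitude(a) * magnitude(b))
    end

    # Hypothetical neighbours: physchem descriptor vectors with a measured endpoint value.
    neighbors = [
      { "physchem" => [10.0, 2.5, 0.1], "tox" => 120.0 },
      { "physchem" => [12.0, 2.0, 0.2], "tox" => 95.0 },
      { "physchem" => [50.0, 9.0, 1.5], "tox" => 4.0 }
    ]

    query = [11.0, 2.4, 0.15] # physchem descriptors of the query particle

    # Weight each neighbour's log-transformed endpoint by its cosine similarity
    # to the query, then back-transform the weighted mean.
    sim_sum = 0.0
    weighted_sum = 0.0
    neighbors.each do |n|
      sim = cosine_similarity(query, n["physchem"])
      sim_sum += sim
      weighted_sum += sim * Math.log10(n["tox"])
    end

    prediction = 10**(weighted_sum / sim_sum)
    puts "Predicted endpoint: #{prediction.round(2)}"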