From ff73a102a87d9e8e409ddf925f2e9477d60cafa7 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Sat, 16 Jan 2016 14:35:53 +0100
Subject: features with unique values removed

---
 import.rb           | 81 +++++++++++++++++++++++++++++++++++++----------------
 lib/nanoparticle.rb |  4 +--
 2 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/import.rb b/import.rb
index 2a83b94..63d8a08 100644
--- a/import.rb
+++ b/import.rb
@@ -1,16 +1,30 @@
+# TODO: missing data for protein corona silver particles
 require 'json'
 require 'yaml'
-#require_relative "../lazar/lib/lazar.rb"
+require 'csv'
 require_relative "lib/nano-lazar.rb"
 include OpenTox
 
+def feature_name uri # builds the display label "title (annotations) [units]" for the feature stored under uri in @features
+  f = @features[uri]
+  name = f['title'].dup # work on a copy: << mutates its receiver, so without dup every call appended to the title kept in @features
+  annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", " # "p: o" pairs, de-duplicated, comma separated
+  name << " (#{annotations})" unless annotations.empty?
+  name << " [#{f['units']}]" if f['units'] and !f['units'].empty? # units suffix only when present and non-empty
+  name
+end
+
 nanomaterials = []
-names = []
+feature_names = {}
+@features = {}
 
 ["nanowiki.json",  "protein-corona.json", "marina.json"].each do |f|
-  JSON.parse(File.read(File.join("data",f)))["dataEntry"].each do |substance|
-    nm = Nanomaterial.new
+  bundle = JSON.parse(File.read(File.join("data",f)))
+  @features.merge! bundle["feature"]
+  bundle["dataEntry"].each do |substance|
+    nm = Nanoparticle.new
     nm.uri = substance["compound"]["URI"]
+    nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"]
     if substance["composition"]
       nr_cores = substance["composition"].select{|c| c["relation"] == "HAS_CORE"}.size
       puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores !=1
@@ -18,7 +32,7 @@ names = []
         component = composition["component"]
         if component
           name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
-          names << name
+          #names << name
           if composition["relation"] == "HAS_CORE"
             nm.core = name
           elsif composition["relation"] == "HAS_COATING"
@@ -73,18 +87,13 @@ modelling_data = nanomaterials.select{|n| n.tox and n.p_chem}
 puts "With TOX data: #{nanomaterials.select{|n| n.tox}.size}"
 puts "With TOX data and particle characterisation: #{modelling_data.size}"
 endpoints = modelling_data.collect{|n| n.tox.collect{|t| t.keys}}.flatten.compact.uniq
+puts
 puts "Endpoints: #{endpoints.size}"
 
 single_value_endpoints = []
 endpoint_values = {}
 
 endpoints.each do |e|
-  #json = `curl -H "Accept:application/json" "#{e}" 2>/dev/null`
-  #f = JSON.parse(json)["feature"]
-  #p k unless f.keys.size == 1
-  #k = f.keys.first
-  #p e
-  #p modelling_data.select{|n| n.tox.select{|t| t[e]}}.size
   i = 0
   values = []
   modelling_data.each do |n|
@@ -97,24 +106,48 @@ endpoints.each do |e|
   end
   single_value_endpoints << e if values.uniq.size == 1
   endpoint_values[e] = values.size unless values.uniq.size == 1
-  #puts "#{f[k]['title']} [#{f[k]['units']}]: #{i} #{values}"
 end
 
 endpoints -= single_value_endpoints
 puts "Endpoints with more than one measurement value: #{endpoints.size}"
-#endpoint_values.sort{|a,b| b[1] <=> a[1]}
 endpoint_values.select!{|k,v| v > 10}
 puts "Endpoints with more than 10 measurements: #{endpoint_values.size}"
 endpoints = endpoint_values.keys
-#puts endpoints.to_yaml
-endpoint_values.sort{|a,b| b[1] <=> a[1]}.each do |e,v|
-  json = `curl -H "Accept:application/json" "#{e}" 2>/dev/null`
-  f = JSON.parse(json)["feature"]
-  p k unless f.keys.size == 1
-  k = f.keys.first
-  p e
-  puts "#{f[k]['title']} [#{f[k]['units']}]: #{v} "
+puts
+puts endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect{|e,v| "#{feature_names[e] ||= (@features[e] ? feature_name(e) : e)}: #{v}"}.join("\n") # feature_names is still empty at this point: resolve and cache each label here, falling back to the endpoint URI when @features has no entry
+
+endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a"
+nanomaterials.select!{|particle| particle.tox && particle.tox.any?{|measurement| measurement.key?(endpoint)}} # keep only particles with a tox entry keyed by the chosen endpoint
+p nanomaterials.size
+
+feature_values = {}
+nanomaterials.each do |nm|
+  (nm.p_chem + nm.tox).each do |f|
+    p f unless f.size == 1 # debug output: flags entries that do not hold exactly one key => value pair
+    k = f.keys.first
+    feature_names[k] ||= (@features[k] ? feature_name(k) : k) # cache the label under the feature key k (not the whole entry f) so the later feature_names[descriptor] lookups find it; ||= keeps it to one feature_name call per feature
+    unless f[k].is_a? String # String values are not collected as descriptor values
+      feature_values[k] ||= []
+      feature_values[k] << f[k]
+    end
+  end
+end
+
+# keep only features that have more than two distinct values
+feature_values.select!{|f,vals| vals.uniq.size > 2}
+tox_descriptors = feature_values.select{|f,vals| f.match 'TOX'}.keys # feature keys containing 'TOX'
+p_chem_descriptors = feature_values.select{|f,vals| f.match 'P-CHEM'}.keys # feature keys containing 'P-CHEM'
+
+#puts @features.to_yaml
+
+# CSV layout: one row per nanoparticle with its name, then the P-CHEM descriptor values, then the TOX descriptor values
+column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]}
+CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv|
+  csv << column_names
+  nanomaterials.each do |nm|
+    if nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint
+      p_chem_values = p_chem_descriptors.collect{|d| nm.p_chem.collect{|pchem| pchem[d]}.compact.first} # first non-nil value per descriptor
+      csv << [nm.name] + p_chem_values + tox_descriptors.collect{|d| nm.tox.collect{|tox| tox[d]}.compact.first} # TOX values are read from nm.tox (previously nm.p_chem, which left these columns empty)
+    end
+  end
 end
-#puts "Endpoints with more than one value single_value_endpoints.size
-#puts names.sort.uniq.to_yaml
-#p nanomaterials.collect{|n| n.uri}.uniq.size
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 3a293ee..0ec29a3 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -1,9 +1,9 @@
 module OpenTox
 
-  class Nanomaterial
+  class Nanoparticle # data holder: this class body only mixes in OpenTox and declares accessors
     include OpenTox
 
-    attr_accessor :uri, :tox, :p_chem, :core, :coating
+    attr_accessor :name, :uri, :tox, :p_chem, :core, :coating # import.rb assigns uri, name and core; tox and p_chem are iterated there as lists of single-entry hashes
 
   end
 end
-- 
cgit v1.2.3