summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch>, 2016-01-16 14:35:53 +0100
committer: Christoph Helma <helma@in-silico.ch>, 2016-01-16 14:35:53 +0100
commit: ff73a102a87d9e8e409ddf925f2e9477d60cafa7 (patch)
tree: 48939c7366ec52a4712e6714cfadfdb8de0490aa
parent: e5b2f59ab602b2fb850a5338f5645ef331e0e66c (diff)
features with unique values removed
-rw-r--r-- import.rb | 81
-rw-r--r-- lib/nanoparticle.rb | 4
2 files changed, 59 insertions, 26 deletions
diff --git a/import.rb b/import.rb
index 2a83b94..63d8a08 100644
--- a/import.rb
+++ b/import.rb
@@ -1,16 +1,30 @@
+# TODO: missing data for protein corona silver particles
require 'json'
require 'yaml'
-#require_relative "../lazar/lib/lazar.rb"
+require 'csv'
require_relative "lib/nano-lazar.rb"
include OpenTox
+def feature_name uri
+ f = @features[uri]
+ name = f['title']
+ annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", "
+ name << " (#{annotations})" unless annotations.empty?
+ name << " [#{f['units']}]" if f['units'] and !f['units'].empty?
+ name
+end
+
nanomaterials = []
-names = []
+feature_names = {}
+@features = {}
["nanowiki.json", "protein-corona.json", "marina.json"].each do |f|
- JSON.parse(File.read(File.join("data",f)))["dataEntry"].each do |substance|
- nm = Nanomaterial.new
+ bundle = JSON.parse(File.read(File.join("data",f)))
+ @features.merge! bundle["feature"]
+ bundle["dataEntry"].each do |substance|
+ nm = Nanoparticle.new
nm.uri = substance["compound"]["URI"]
+ nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"]
if substance["composition"]
nr_cores = substance["composition"].select{|c| c["relation"] == "HAS_CORE"}.size
puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores !=1
@@ -18,7 +32,7 @@ names = []
component = composition["component"]
if component
name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
- names << name
+ #names << name
if composition["relation"] == "HAS_CORE"
nm.core = name
elsif composition["relation"] == "HAS_COATING"
@@ -73,18 +87,13 @@ modelling_data = nanomaterials.select{|n| n.tox and n.p_chem}
puts "With TOX data: #{nanomaterials.select{|n| n.tox}.size}"
puts "With TOX data and particle characterisation: #{modelling_data.size}"
endpoints = modelling_data.collect{|n| n.tox.collect{|t| t.keys}}.flatten.compact.uniq
+puts
puts "Endpoints: #{endpoints.size}"
single_value_endpoints = []
endpoint_values = {}
endpoints.each do |e|
- #json = `curl -H "Accept:application/json" "#{e}" 2>/dev/null`
- #f = JSON.parse(json)["feature"]
- #p k unless f.keys.size == 1
- #k = f.keys.first
- #p e
- #p modelling_data.select{|n| n.tox.select{|t| t[e]}}.size
i = 0
values = []
modelling_data.each do |n|
@@ -97,24 +106,48 @@ endpoints.each do |e|
end
single_value_endpoints << e if values.uniq.size == 1
endpoint_values[e] = values.size unless values.uniq.size == 1
- #puts "#{f[k]['title']} [#{f[k]['units']}]: #{i} #{values}"
end
endpoints -= single_value_endpoints
puts "Endpoints with more than one measurement value: #{endpoints.size}"
-#endpoint_values.sort{|a,b| b[1] <=> a[1]}
endpoint_values.select!{|k,v| v > 10}
puts "Endpoints with more than 10 measurements: #{endpoint_values.size}"
endpoints = endpoint_values.keys
-#puts endpoints.to_yaml
-endpoint_values.sort{|a,b| b[1] <=> a[1]}.each do |e,v|
- json = `curl -H "Accept:application/json" "#{e}" 2>/dev/null`
- f = JSON.parse(json)["feature"]
- p k unless f.keys.size == 1
- k = f.keys.first
- p e
- puts "#{f[k]['title']} [#{f[k]['units']}]: #{v} "
+puts
+puts endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect{|e,v| "#{feature_names[e]}: #{v}"}.join("\n")
+
+endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a"
+nanomaterials.select!{|nm| nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint}
+p nanomaterials.size
+
+feature_values = {}
+nanomaterials.each do |nm|
+ (nm.p_chem + nm.tox).each do |f|
+ feature_names[f] = feature_name f # avoid appending annotations/units with each function call, unclear why it happens
+ p f unless f.size == 1
+ k = f.keys.first
+ unless f[k].is_a? String
+ feature_values[k] ||= []
+ feature_values[k] << f[k]
+ end
+ end
+end
+
+# remove empty values
+feature_values.select!{|f,vals| vals.uniq.size > 2}
+tox_descriptors = feature_values.select{|f,vals| f.match 'TOX'}.keys
+p_chem_descriptors = feature_values.select{|f,vals| f.match 'P-CHEM'}.keys
+
+#puts @features.to_yaml
+
+column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]}
+table = []
+CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv|
+ csv << column_names
+ nanomaterials.each do |nm|
+ if nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint
+ #table << []
+ csv << [nm.name] + p_chem_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} + tox_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first}
+ end
+ end
end
-#puts "Endpoints with more than one value single_value_endpoints.size
-#puts names.sort.uniq.to_yaml
-#p nanomaterials.collect{|n| n.uri}.uniq.size
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 3a293ee..0ec29a3 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -1,9 +1,9 @@
module OpenTox
- class Nanomaterial
+ class Nanoparticle
include OpenTox
- attr_accessor :uri, :tox, :p_chem, :core, :coating
+ attr_accessor :name, :uri, :tox, :p_chem, :core, :coating
end
end