import.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

# TODO: missing data for protein corona silver particles
require 'json'
require 'yaml'
require 'csv'
require_relative "lib/nano-lazar.rb"
include OpenTox

def feature_name uri
  f = @features[uri]
  name = f['title']
  annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", "
  name << " (#{annotations})" unless annotations.empty?
  name << " [#{f['units']}]" if f['units'] and !f['units'].empty?
  name
end

nanomaterials = []
feature_names = {}
@features = {}

["nanowiki.json",  "protein-corona.json", "marina.json"].each do |f|
  bundle = JSON.parse(File.read(File.join("data",f)))
  @features.merge! bundle["feature"]
  bundle["dataEntry"].each do |substance|
    nm = Nanoparticle.new
    nm.uri = substance["compound"]["URI"]
    nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"]
    if substance["composition"]
      nr_cores = substance["composition"].select{|c| c["relation"] == "HAS_CORE"}.size
      puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores !=1
      substance["composition"].each do |composition|
        component = composition["component"]
        if component
          name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
          #names << name
          if composition["relation"] == "HAS_CORE"
            nm.core = name
          elsif composition["relation"] == "HAS_COATING"
            nm.coating ||= []
            nm.coating << name
          end
        else
          #puts substance.to_yaml
        end
      end
    else
      #puts substance.to_yaml
    end
    substance["values"].each do |k,v|
      property = nil
      if k.match(/TOX/)
        nm.tox ||= []
        property = "tox"
      elsif k.match(/P-CHEM/)
        nm.p_chem ||= []
        property = "p_chem"
      end
      if property
        v.each do |val|
          if val.keys == ["loValue"]
            nm.tox << {k => val["loValue"]} if property == "tox"
            nm.p_chem << {k => val["loValue"]} if property == "p_chem"
          elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == "mean"
            nm.tox << {k => val["loValue"]} if property == "tox"
            nm.p_chem << {k => val["loValue"]} if property == "p_chem"
          elsif val.keys == ["loQualifier", "loValue", "upQualifier", "upValue" ]
            nm.tox << {k => (val["loValue"]+val["upValue"])/2} if property == "tox"
            nm.p_chem << {k => (val["loValue"]+val["upValue"])/2} if property == "p_chem"
          elsif val.keys == ["loQualifier", "loValue"] and val["loQualifier"] == ">="
          else
          p val
          end
        end
      else
        #p k,v
      end
    end
    nm.tox.uniq! if nm.tox
    nm.p_chem.uniq! if nm.p_chem
    nanomaterials << nm
  end
end

puts "Total imported: #{nanomaterials.size}"
puts "With nanoparticle characterisation: #{nanomaterials.select{|n| n.p_chem}.size}"
modelling_data = nanomaterials.select{|n| n.tox and n.p_chem}
puts "With TOX data: #{nanomaterials.select{|n| n.tox}.size}"
puts "With TOX data and particle characterisation: #{modelling_data.size}"
endpoints = modelling_data.collect{|n| n.tox.collect{|t| t.keys}}.flatten.compact.uniq
puts
puts "Endpoints: #{endpoints.size}"

single_value_endpoints = []
endpoint_values = {}

endpoints.each do |e|
  i = 0
  values = []
  modelling_data.each do |n|
    n.tox.each do |t|
      if t[e]
        i += 1
        values << t[e]
      end
    end
  end
  single_value_endpoints << e if values.uniq.size == 1
  endpoint_values[e] = values.size unless values.uniq.size == 1
end

endpoints -= single_value_endpoints
puts "Endpoints with more than one measurement value: #{endpoints.size}"
endpoint_values.select!{|k,v| v > 10}
puts "Endpoints with more than 10 measurements: #{endpoint_values.size}"
endpoints = endpoint_values.keys
puts
puts endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect{|e,v| "#{feature_names[e]}: #{v}"}.join("\n")

endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a"
nanomaterials.select!{|nm| nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint}
p nanomaterials.size

feature_values = {}
nanomaterials.each do |nm|
  (nm.p_chem + nm.tox).each do |f|
    feature_names[f] = feature_name f # avoid appending annotations/units with each function call, unclear why it happens
    p f unless f.size == 1
    k = f.keys.first
    unless f[k].is_a? String
      feature_values[k] ||= []
      feature_values[k] << f[k]
    end
  end
end

# remove empty values
feature_values.select!{|f,vals| vals.uniq.size > 2}
tox_descriptors = feature_values.select{|f,vals| f.match 'TOX'}.keys
p_chem_descriptors = feature_values.select{|f,vals| f.match 'P-CHEM'}.keys

#puts @features.to_yaml

column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]}
table = []
CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv|
  csv << column_names
  nanomaterials.each do |nm|
    if nm.tox and nm.tox.collect{|t| t.keys}.flatten.include? endpoint
      #table << []
      csv << [nm.name] + p_chem_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first} + tox_descriptors.collect{|p| nm.p_chem.collect{|pchem| pchem[p]}.compact.first}
    end
  end
end