1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
module OpenTox
class Nanoparticle < Substance
include OpenTox
field :core, type: Hash, default: {}
field :coating, type: Array, default: []
field :proteomics, type: Hash, default: {}
# Legacy neighbor search based on plain cosine similarity.
#
# Every nanoparticle of the dataset that has values for the prediction
# feature is compared with this particle. Only descriptor ids present in
# both particles and for which NumericFeature.find returns a truthy result
# are used; the first stored value of each descriptor enters the vector.
#
# @param min_sim minimum similarity a candidate needs to be reported
# @param type accepted but not used by this method
# @param dataset_id id handed to Dataset.find
# @param prediction_feature_id feature whose values are looked up per particle
# @return [Array<Hash>] hashes with the keys "_id", "toxicities" and
#   "similarity", ordered by descending similarity
def nanoparticle_neighbors_old(min_sim: 0.9, type:, dataset_id:, prediction_feature_id:)
  dataset = Dataset.find(dataset_id)

  hits = dataset.nanoparticles.each_with_object([]) do |candidate, found|
    toxicities = dataset.values(candidate, prediction_feature_id)
    next unless toxicities

    shared_ids = (physchem_descriptors.keys & candidate.physchem_descriptors.keys)
                 .select { |feature_id| NumericFeature.find(feature_id) }
    own_vector = shared_ids.map { |feature_id| physchem_descriptors[feature_id].first }
    other_vector = shared_ids.map { |feature_id| candidate.physchem_descriptors[feature_id].first }

    similarity = Algorithm::Similarity.cosine(own_vector, other_vector)
    # Written as a positive ">=" test so that a NaN similarity is dropped too.
    next unless similarity >= min_sim

    found << { "_id" => candidate.id, "toxicities" => toxicities, "similarity" => similarity }
  end

  hits.sort { |left, right| right["similarity"] <=> left["similarity"] }
end
# Finds dataset substances similar to this nanoparticle, using only
# descriptors that correlate with the measured activities.
#
# Steps:
# 1. Collect every activity of the prediction feature together with the
#    substance it belongs to (one entry per activity).
# 2. For each of this particle's descriptors whose Feature is a
#    NumericFeature, run a Pearson correlation test in R against the
#    activities. Descriptors with a p-value <= 0.05 are kept together with
#    their r, mean and standard deviation.
# 3. For each substance, scale the median descriptor values with the stored
#    mean/sd and compute a weighted cosine similarity (weights are r**2).
#
# Each substance is reported at most once, even if it has several
# activities, and nothing is printed to stdout.
#
# @param min_sim minimum similarity a substance needs to be reported
# @param type accepted but not used by this method
# @param dataset_id id handed to Dataset.find
# @param prediction_feature_id feature whose values are used as activities
# @return [Array<Hash>] hashes with the keys "_id", "toxicities" and
#   "similarity", ordered by descending similarity
def nanoparticle_neighbors(min_sim: 0.9, type:, dataset_id:, prediction_feature_id:)
  dataset = Dataset.find(dataset_id)
  relevant_features = {}
  toxicities = []
  substances = []
  # TODO: exclude query activities!!!
  dataset.substances.each do |s|
    # Substances without measurements yield nil and are skipped.
    (dataset.values(s, prediction_feature_id) || []).each do |act|
      toxicities << act
      substances << s # kept parallel to toxicities, one entry per activity
    end
  end
  R.assign "tox", toxicities
  feature_ids = physchem_descriptors.keys.select { |fid| Feature.find(fid).is_a? NumericFeature }
  # identify relevant features
  feature_ids.each do |feature_id|
    feature_values = substances.collect do |s|
      s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]
    end
    R.assign "feature", feature_values
    begin
      R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
      pvalue = R.eval("cor$p.value").to_ruby
      if pvalue <= 0.05
        # Build the entry completely before storing it, so a failing R call
        # cannot leave a half-filled entry behind.
        relevant_features[feature_id] = {
          "pvalue" => pvalue,
          "r" => R.eval("cor$estimate").to_ruby,
          "mean" => R.eval("mean(feature, na.rm=TRUE)").to_ruby,
          "sd" => R.eval("sd(feature, na.rm=TRUE)").to_ruby
        }
      end
    rescue StandardError
      warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
    end
  end

  # Standardizes the median of one descriptor with the statistics from step 2.
  scale = lambda do |descriptors, d|
    (descriptors[d].median - relevant_features[d]["mean"]) / relevant_features[d]["sd"]
  end

  neighbors = []
  # substances holds one entry per activity; deduplicate by id so that each
  # substance can become a neighbor only once.
  substances.uniq { |s| s.id }.each do |substance|
    values = dataset.values(substance, prediction_feature_id)
    next unless values

    common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys
    query_descriptors = common_descriptors.collect { |d| scale.call(physchem_descriptors, d) }
    neighbor_descriptors = common_descriptors.collect { |d| scale.call(substance.physchem_descriptors, d) }
    weights = common_descriptors.collect { |d| relevant_features[d]["r"]**2 }
    sim = Algorithm::Similarity.weighted_cosine(query_descriptors, neighbor_descriptors, weights)
    neighbors << { "_id" => substance.id, "toxicities" => values, "similarity" => sim } if sim >= min_sim
  end
  neighbors.sort! { |a, b| b["similarity"] <=> a["similarity"] }
  neighbors
end
# Records one measured value for this particle, routed by the feature's
# category:
#
# * "P-CHEM"     -> appended to physchem_descriptors
# * "Proteomics" -> appended to proteomics
# * "TOX"        -> added to the dataset, which is then saved and remembered
#                   in dataset_ids. "Net cell association" values with the
#                   unit "mL/ug(Mg)" are stored as log2, and
#                   "Total protein (BCA assay)" goes to physchem_descriptors
#                   instead of the dataset.
# * anything else is reported with a warning and not stored.
#
# Values are kept in per-feature lists (keyed by feature.id.to_s) without
# duplicates. Features named "ATOMIC COMPOSITION" or "FUNCTIONAL GROUP" are
# skipped entirely.
def add_feature(feature, value, dataset)
  # redundant features, nothing to store
  return if ["ATOMIC COMPOSITION", "FUNCTIONAL GROUP"].include?(feature.name)

  # Appends the value to the feature's list inside +store+ and drops duplicates.
  remember = lambda do |store|
    key = feature.id.to_s
    store[key] ||= []
    store[key] << value
    store[key].uniq!
  end

  case feature.category
  when "P-CHEM" then remember.call(physchem_descriptors)
  when "Proteomics" then remember.call(proteomics)
  when "TOX"
    # TODO generic way of parsing TOX values
    if feature.name == "Net cell association" && feature.unit == "mL/ug(Mg)"
      dataset.add(self, feature, Math.log2(value))
    elsif feature.name == "Total protein (BCA assay)"
      remember.call(physchem_descriptors)
    else
      dataset.add(self, feature, value)
    end
    dataset.save
    dataset_ids << dataset.id
    dataset_ids.uniq!
  else
    warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
  end
end
# Interprets one Ambit eNanoMapper value hash and stores it via add_feature.
#
# The "unit" entry is ignored. The remaining keys decide what happens:
# * only "textValue" or only "loValue": that value is stored
# * two keys including "errorValue": "loValue" is stored, the error is ignored
# * two keys with "loQualifier" == "mean": "loValue" is stored
# * two keys with any other "loQualifier", or with "upQualifier": only a
#   bound is known, the entry is ignored
# * three or four keys with "loValue" and nil (or, for three keys, empty)
#   qualifiers: "loValue" is stored
# * four keys with both qualifiers and both values: the mean of the range
#   is stored
# * four keys with "loQualifier" == "mean" and "errorValue": "loValue" is
#   stored
# * empty hash: nothing happens
# Everything else is reported as unparsable. Most cases also emit a warning
# describing what was lost.
#
# The hash passed in is not modified.
#
# @param feature feature the value belongs to (its name is used in warnings)
# @param v [Hash] value hash with string keys
# @param dataset passed through to add_feature
def parse_ambit_value(feature, v, dataset)
  # Classify a copy without the unit; deleting from the argument itself
  # would alter the caller's data and fail on frozen hashes.
  v = v.reject { |key, _| key == "unit" }
  keys = v.keys
  lo_value = v["loValue"]
  lo_qualifier = v["loQualifier"]
  up_qualifier = v["upQualifier"]
  error_value = v["errorValue"]
  # TODO: ppm instead of weights
  if keys == ["textValue"]
    add_feature feature, v["textValue"], dataset
  elsif keys == ["loValue"]
    add_feature feature, lo_value, dataset
  elsif keys.size == 2 && error_value
    add_feature feature, lo_value, dataset
    warn "Ignoring errorValue '#{error_value}' for '#{feature.name}'."
  elsif keys.size == 2 && lo_qualifier == "mean"
    add_feature feature, lo_value, dataset
    warn "'#{feature.name}' is a mean value. Original data is not available."
  elsif keys.size == 2 && lo_qualifier # e.g. ">="
    warn "Only min value available for '#{feature.name}', entry ignored"
  elsif keys.size == 2 && up_qualifier # e.g. "<="
    warn "Only max value available for '#{feature.name}', entry ignored"
  elsif keys.size == 3 && lo_value && lo_qualifier.nil? && up_qualifier.nil?
    add_feature feature, lo_value, dataset
    warn "loQualifier and upQualifier are empty."
  elsif keys.size == 3 && lo_value && lo_qualifier == "" && up_qualifier == ""
    add_feature feature, lo_value, dataset
    warn "loQualifier and upQualifier are empty."
  elsif keys.size == 4 && lo_value && lo_qualifier.nil? && up_qualifier.nil?
    add_feature feature, lo_value, dataset
    warn "loQualifier and upQualifier are empty."
  elsif keys.size == 4 && lo_qualifier && up_qualifier && lo_value && v["upValue"]
    add_feature feature, [lo_value, v["upValue"]].mean, dataset
    warn "Using mean value of range #{lo_value} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
  elsif keys.size == 4 && lo_qualifier == "mean" && error_value
    warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{error_value}' for '#{feature.name}'."
    add_feature feature, lo_value, dataset
  elsif v.empty?
    # do nothing
  else
    warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
  end
end
end
end
|