lib/nanoparticle.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

module OpenTox

  # Substance subclass describing a nanoparticle.
  #
  # Besides the declared fields it relies on +physchem_descriptors+ and
  # +dataset_ids+, which are not declared in this class (presumably provided
  # by Substance -- confirm there).
  class Nanoparticle < Substance
    include OpenTox

    field :core, type: Hash, default: {}
    field :coating, type: Array, default: []
    field :proteomics, type: Hash, default: {}

    # Legacy neighbor search based on plain cosine similarity.
    #
    # Only the first value of each descriptor shared by the query (self) and
    # the candidate nanoparticle is compared.
    #
    # @param min_sim [Numeric] minimum similarity a neighbor must reach
    # @param type required keyword, not used by this method
    # @param dataset_id id of the dataset that is searched for neighbors
    # @param prediction_feature_id id of the feature whose values are reported
    # @return [Array<Hash>] hashes with the keys "_id", "toxicities" and
    #   "similarity", sorted by descending similarity
    def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id:
      dataset = Dataset.find(dataset_id)
      neighbors = []
      dataset.nanoparticles.each do |np|
        values = dataset.values(np, prediction_feature_id)
        next unless values
        common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys
        common_descriptors.select! { |id| NumericFeature.find(id) }
        query_descriptors = common_descriptors.map { |d| physchem_descriptors[d].first }
        neighbor_descriptors = common_descriptors.map { |d| np.physchem_descriptors[d].first }
        sim = Algorithm::Similarity.cosine(query_descriptors, neighbor_descriptors)
        neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
      end
      neighbors.sort! { |a, b| b["similarity"] <=> a["similarity"] }
      neighbors
    end

    # Neighbor search based on a weighted cosine similarity.
    #
    # Descriptors are restricted to those that correlate significantly with
    # the prediction feature (see #correlated_features), standardised with the
    # mean/sd computed over the dataset and weighted by the squared
    # correlation coefficient.
    #
    # @param min_sim [Numeric] minimum similarity a neighbor must reach
    # @param type required keyword, not used by this method
    # @param dataset_id id of the dataset that is searched for neighbors
    # @param prediction_feature_id id of the feature whose values are reported
    # @return [Array<Hash>] hashes with the keys "_id", "toxicities" and
    #   "similarity", sorted by descending similarity; every substance appears
    #   at most once
    def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id:
      dataset = Dataset.find(dataset_id)
      toxicities = []
      substances = []
      # TODO: exclude query activities!!!
      # One (activity, substance) pair per measurement, so that both arrays
      # stay aligned for the correlation test.
      dataset.substances.each do |s|
        # Array() tolerates substances without values for the prediction feature.
        Array(dataset.values(s, prediction_feature_id)).each do |act|
          toxicities << act
          substances << s
        end
      end
      relevant_features = correlated_features(substances, toxicities, prediction_feature_id)

      # Standardise the median of a descriptor with the dataset-wide mean/sd.
      scale = lambda do |descriptors, d|
        stats = relevant_features[d]
        (descriptors[d].median - stats["mean"]) / stats["sd"]
      end

      neighbors = []
      # substances holds one entry per measurement; visit each substance once
      # so that it is not reported several times as a neighbor.
      substances.uniq(&:id).each do |substance|
        values = dataset.values(substance, prediction_feature_id)
        next unless values
        common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys
        query_descriptors = common_descriptors.map { |d| scale.call(physchem_descriptors, d) }
        neighbor_descriptors = common_descriptors.map { |d| scale.call(substance.physchem_descriptors, d) }
        # alternative weighting that was considered: 1 - pvalue
        weights = common_descriptors.map { |d| relevant_features[d]["r"]**2 }
        sim = Algorithm::Similarity.weighted_cosine(query_descriptors, neighbor_descriptors, weights)
        neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
      end
      neighbors.sort! { |a, b| b["similarity"] <=> a["similarity"] }
      neighbors
    end

    # Store a single measured value for this nanoparticle.
    #
    # "P-CHEM" and "Proteomics" values are collected (without duplicates) in
    # +physchem_descriptors+ and +proteomics+; "TOX" values are added to
    # +dataset+. Features named "ATOMIC COMPOSITION" or "FUNCTIONAL GROUP" are
    # skipped, values of unknown categories are reported with a warning.
    #
    # @param feature object responding to +name+, +category+, +unit+ and +id+
    # @param value the measured value
    # @param dataset dataset receiving "TOX" values
    def add_feature feature, value, dataset
      return if feature.name == "ATOMIC COMPOSITION" || feature.name == "FUNCTIONAL GROUP" # redundant

      case feature.category
      when "P-CHEM"
        append_unique_value physchem_descriptors, feature, value
      when "Proteomics"
        append_unique_value proteomics, feature, value
      when "TOX"
        # TODO generic way of parsing TOX values
        if feature.name == "Net cell association" && feature.unit == "mL/ug(Mg)"
          dataset.add self, feature, Math.log2(value)
        elsif feature.name == "Total protein (BCA assay)"
          # treated as a descriptor instead of a toxicity value
          append_unique_value physchem_descriptors, feature, value
        else
          dataset.add self, feature, value
        end
        # NOTE(review): the dataset is saved and linked for every "TOX" value,
        # including the BCA assay value that is not added to it -- confirm
        # that this is intended.
        dataset.save
        dataset_ids << dataset.id
        dataset_ids.uniq!
      else
        warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
      end
    end

    # Interpret a value hash as delivered by Ambit eNanoMapper and store the
    # resulting value with #add_feature.
    #
    # The "unit" key is removed from +v+ (the hash passed in is modified).
    # Which value is stored depends on the remaining keys; entries that only
    # give a lower or upper bound are ignored and unparseable entries are
    # reported with a warning.
    #
    # @param feature feature the value belongs to
    # @param v [Hash] value hash with string keys such as "loValue",
    #   "upValue", "loQualifier", "upQualifier", "errorValue", "textValue"
    # @param dataset dataset handed through to #add_feature
    def parse_ambit_value feature, v, dataset
      v.delete "unit"
      # TODO: ppm instead of weights
      size = v.size
      lo_value = v["loValue"]
      lo_qualifier = v["loQualifier"]
      up_qualifier = v["upQualifier"]
      error_value = v["errorValue"]
      qualifiers_nil = lo_qualifier.nil? && up_qualifier.nil?
      qualifiers_blank = lo_qualifier == "" && up_qualifier == ""

      # The order of the branches matters: earlier, more specific key
      # combinations take precedence over later ones.
      if v.keys == ["textValue"]
        add_feature feature, v["textValue"], dataset
      elsif v.keys == ["loValue"]
        add_feature feature, lo_value, dataset
      elsif size == 2 && error_value
        add_feature feature, lo_value, dataset
        warn "Ignoring errorValue '#{error_value}' for '#{feature.name}'."
      elsif size == 2 && lo_qualifier == "mean"
        add_feature feature, lo_value, dataset
        warn "'#{feature.name}' is a mean value. Original data is not available."
      elsif size == 2 && lo_qualifier # e.g. ">="
        warn "Only min value available for '#{feature.name}', entry ignored"
      elsif size == 2 && up_qualifier
        warn "Only max value available for '#{feature.name}', entry ignored"
      elsif lo_value && ((size == 3 && (qualifiers_nil || qualifiers_blank)) || (size == 4 && qualifiers_nil))
        add_feature feature, lo_value, dataset
        warn "loQualifier and upQualifier are empty."
      elsif size == 4 && lo_qualifier && up_qualifier && lo_value && v["upValue"]
        add_feature feature, [lo_value, v["upValue"]].mean, dataset
        warn "Using mean value of range #{lo_value} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
      elsif size == 4 && lo_qualifier == "mean" && error_value
        warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{error_value}' for '#{feature.name}'."
        add_feature feature, lo_value, dataset
      elsif v.empty?
        # nothing to store
      else
        warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
      end
    end

    private

    # Determine the numeric descriptors of this nanoparticle that correlate
    # significantly (Pearson, p <= 0.05) with the given toxicities.
    #
    # +substances+ and +toxicities+ must be aligned (one substance per
    # activity). The correlation is computed in R; a failing test is reported
    # with a warning and the feature is skipped.
    #
    # @return [Hash] feature id => {"pvalue", "r", "mean", "sd"}
    def correlated_features substances, toxicities, prediction_feature_id
      relevant_features = {}
      R.assign "tox", toxicities
      feature_ids = physchem_descriptors.keys.select { |fid| Feature.find(fid).is_a? NumericFeature }
      feature_ids.each do |feature_id|
        # first value of the descriptor, nil where a substance lacks it
        feature_values = substances.map do |s|
          s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]
        end
        R.assign "feature", feature_values
        begin
          R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
          pvalue = R.eval("cor$p.value").to_ruby
          if pvalue <= 0.05
            # Build the entry completely before storing it, so that a failing
            # R call cannot leave a partially filled entry behind.
            relevant_features[feature_id] = {
              "pvalue" => pvalue,
              "r" => R.eval("cor$estimate").to_ruby,
              "mean" => R.eval("mean(feature, na.rm=TRUE)").to_ruby,
              "sd" => R.eval("sd(feature, na.rm=TRUE)").to_ruby
            }
          end
        rescue
          warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
        end
      end
      relevant_features
    end

    # Append +value+ to the list stored under the feature id in +store+,
    # creating the list if necessary and dropping duplicates.
    def append_unique_value store, feature, value
      key = feature.id.to_s
      store[key] ||= []
      store[key] << value
      store[key].uniq!
    end

  end
end