# 321636d82a86292b5478a1439768bb59f94af89b
# [lazar] / lib / model.rb
1 module OpenTox
2
3   module Model
4
5     class Lazar 
6
7       include OpenTox
8       include Mongoid::Document
9       include Mongoid::Timestamps
10       store_in collection: "models"
11
12       attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
13
14       field :name, type: String
15       field :creator, type: String, default: __FILE__
16       field :algorithms, type: Hash, default:{}
17       field :training_dataset_id, type: BSON::ObjectId
18       field :substance_ids, type: Array, default:[]
19       field :prediction_feature_id, type: BSON::ObjectId
20       field :dependent_variables, type: Array, default:[]
21       field :descriptor_ids, type:Array, default:[]
22       field :independent_variables_id, type: BSON::ObjectId
23       field :fingerprints, type: Array, default:[]
24       field :descriptor_weights, type: Array, default:[]
25       field :descriptor_means, type: Array, default:[]
26       field :descriptor_sds, type: Array, default:[]
27       field :scaled_variables, type: Array, default:[]
28       field :version, type: Hash, default:{}
29       
30       # Create a lazar model
31       # @param [OpenTox::Dataset, nil] training_dataset
32       # @param [OpenTox::Feature, nil] prediction_feature
33       # @param [Hash] algorithms
34       # @return [OpenTox::Model::Lazar]
35       def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
36         bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
37         prediction_feature = training_dataset.features.first unless prediction_feature
38         # TODO: prediction_feature without training_dataset: use all available data
39
40         # guess model type
41         prediction_feature.numeric? ?  model = LazarRegression.new : model = LazarClassification.new
42
43         model.prediction_feature_id = prediction_feature.id
44         model.training_dataset_id = training_dataset.id
45         model.name = "#{prediction_feature.name} (#{training_dataset.name})" 
46         # TODO: check if this works for gem version, add gem versioning?
47         dir = File.dirname(__FILE__)
48         commit = `cd #{dir}; git rev-parse HEAD`.chomp
49         branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
50         url = `cd #{dir}; git config --get remote.origin.url`.chomp
51         if branch
52           model.version = {:url => url, :branch => branch, :commit => commit}
53         else
54           model.version = {:warning => "git is not installed"}
55         end
56
57         # set defaults
58         substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
59         bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
60
61         if substance_classes.first == "OpenTox::Compound"
62
63           model.algorithms = {
64             :descriptors => {
65               :method => "fingerprint",
66               :type => "MP2D",
67             },
68             :similarity => {
69               :method => "Algorithm::Similarity.tanimoto",
70               :min => 0.1
71             },
72             :feature_selection => nil
73           }
74
75           if model.class == LazarClassification
76             model.algorithms[:prediction] = {
77                 :method => "Algorithm::Classification.weighted_majority_vote",
78             }
79           elsif model.class == LazarRegression
80             model.algorithms[:prediction] = {
81               :method => "Algorithm::Caret.pls",
82             }
83           end
84
85         elsif substance_classes.first == "OpenTox::Nanoparticle"
86           model.algorithms = {
87             :descriptors => {
88               :method => "properties",
89               :categories => ["P-CHEM"],
90             },
91             :similarity => {
92               :method => "Algorithm::Similarity.weighted_cosine",
93               :min => 0.5
94             },
95             :prediction => {
96               :method => "Algorithm::Caret.rf",
97             },
98             :feature_selection => {
99               :method => "Algorithm::FeatureSelection.correlation_filter",
100             },
101           }
102         else
103           bad_request_error "Cannot create models for #{substance_classes.first}."
104         end
105         
106         # overwrite defaults with explicit parameters
107         algorithms.each do |type,parameters|
108           if parameters and parameters.is_a? Hash
109             parameters.each do |p,v|
110               model.algorithms[type] ||= {}
111               model.algorithms[type][p] = v
112               model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
113             end
114           else
115             model.algorithms[type] = parameters
116           end
117         end if algorithms
118
119         # parse dependent_variables from training dataset
120         training_dataset.substances.each do |substance|
121           values = training_dataset.values(substance,model.prediction_feature_id)
122           values.each do |v|
123             model.substance_ids << substance.id.to_s
124             model.dependent_variables << v
125           end if values
126         end
127
128         descriptor_method = model.algorithms[:descriptors][:method]
129         model.independent_variables = []
130         case descriptor_method
131         # parse fingerprints
132         when "fingerprint"
133           type = model.algorithms[:descriptors][:type]
134           model.substances.each_with_index do |s,i|
135             model.fingerprints[i] ||= [] 
136             model.fingerprints[i] += s.fingerprint(type)
137             model.fingerprints[i].uniq!
138           end
139           model.descriptor_ids = model.fingerprints.flatten.uniq
140           model.descriptor_ids.each do |d|
141             # resulting model may break BSON size limit (e.g. f Kazius dataset)
142             model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
143           end
144         # calculate physchem properties
145         when "calculate_properties"
146           features = model.algorithms[:descriptors][:features]
147           model.descriptor_ids = features.collect{|f| f.id.to_s}
148           model.algorithms[:descriptors].delete(:features)
149           model.algorithms[:descriptors].delete(:type)
150           model.substances.each_with_index do |s,i|
151             props = s.calculate_properties(features)
152             props.each_with_index do |v,j|
153               model.independent_variables[j] ||= []
154               model.independent_variables[j][i] = v
155             end if props and !props.empty?
156           end
157         # parse independent_variables
158         when "properties"
159           categories = model.algorithms[:descriptors][:categories]
160           feature_ids = []
161           categories.each do |category|
162             Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
163           end
164           properties = model.substances.collect { |s| s.properties  }
165           property_ids = properties.collect{|p| p.keys}.flatten.uniq
166           model.descriptor_ids = feature_ids & property_ids
167           model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
168         else
169           bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
170         end
171         
172         if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
173           model = Algorithm.run model.algorithms[:feature_selection][:method], model
174         end
175
176         # scale independent_variables
177         unless model.fingerprints?
178           model.independent_variables.each_with_index do |var,i|
179             model.descriptor_means[i] = var.mean
180             model.descriptor_sds[i] =  var.standard_deviation
181             model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
182           end
183         end
184         model.save
185         model
186       end
187
188       # Predict a substance 
189       # @param [OpenTox::Substance]
190       # @return [Hash]
191       def predict_substance substance
192         
193         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
194         case algorithms[:similarity][:method]
195         when /tanimoto/ # binary features
196           similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
197           # TODO this excludes descriptors only present in the query substance
198           # use for applicability domain?
199           query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
200         when /euclid|cosine/ # quantitative features
201           if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
202             features = descriptor_ids.collect{|id| Feature.find(id)}
203             query_descriptors = substance.calculate_properties(features)
204             similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
205           else
206             similarity_descriptors = []
207             query_descriptors = []
208             descriptor_ids.each_with_index do |id,i|
209               prop = substance.properties[id]
210               prop = prop.median if prop.is_a? Array # measured
211               if prop
212                 similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
213                 query_descriptors[i] = prop
214               end
215             end
216           end
217         else
218           bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
219         end
220         
221         prediction = {}
222         neighbor_ids = []
223         neighbor_similarities = []
224         neighbor_dependent_variables = []
225         neighbor_independent_variables = []
226
227         prediction = {}
228         # find neighbors
229         substance_ids.each_with_index do |s,i|
230           # handle query substance
231           if substance.id.to_s == s
232             prediction[:measurements] ||= []
233             prediction[:measurements] << dependent_variables[i]
234             prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
235           else
236             if fingerprints?
237               neighbor_descriptors = fingerprints[i]
238             else
239               next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
240               neighbor_descriptors = scaled_variables.collect{|v| v[i]}
241             end
242             sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
243             if sim >= algorithms[:similarity][:min]
244               neighbor_ids << s
245               neighbor_similarities << sim
246               neighbor_dependent_variables << dependent_variables[i]
247               independent_variables.each_with_index do |c,j|
248                 neighbor_independent_variables[j] ||= []
249                 neighbor_independent_variables[j] << @independent_variables[j][i]
250               end
251             end
252           end
253         end
254
255         measurements = nil
256         
257         if neighbor_similarities.empty?
258           prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
259         elsif neighbor_similarities.size == 1
260           prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
261         else
262           query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
263           # call prediction algorithm
264           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
265           prediction.merge! result
266           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
267         end
268         prediction
269       end
270
271       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
272       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
273       # @return [Hash, Array<Hash>, OpenTox::Dataset]
274       def predict object
275
276         training_dataset = Dataset.find training_dataset_id
277
278         # parse data
279         substances = []
280         if object.is_a? Substance
281           substances = [object] 
282         elsif object.is_a? Array
283           substances = object
284         elsif object.is_a? Dataset
285           substances = object.substances
286         else 
287           bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
288         end
289
290         # make predictions
291         predictions = {}
292         substances.each do |c|
293           predictions[c.id.to_s] = predict_substance c
294           predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id 
295         end
296
297         # serialize result
298         if object.is_a? Substance
299           prediction = predictions[substances.first.id.to_s]
300           prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
301           return prediction
302         elsif object.is_a? Array
303           return predictions
304         elsif object.is_a? Dataset
305           # prepare prediction dataset
306           measurement_feature = Feature.find prediction_feature_id
307
308           prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
309           prediction_dataset = LazarPrediction.create(
310             :name => "Lazar prediction for #{prediction_feature.name}",
311             :creator =>  __FILE__,
312             :prediction_feature_id => prediction_feature.id,
313             :predictions => predictions
314           )
315           return prediction_dataset
316         end
317
318       end
319
320       def save # store independent_variables in GridFS to avoid Mongo database size limit problems
321         file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
322         self.independent_variables_id = $gridfs.insert_one(file)
323         super
324       end
325
326       # Get independent variables
327       # @return [Array<Array>]
328       def independent_variables 
329         @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
330         @independent_variables
331       end
332
333       # Get training dataset
334       # @return [OpenTox::Dataset]
335       def training_dataset
336         Dataset.find(training_dataset_id)
337       end
338
339       # Get prediction feature
340       # @return [OpenTox::Feature]
341       def prediction_feature
342         Feature.find(prediction_feature_id)
343       end
344
345       # Get training descriptors
346       # @return [Array<OpenTox::Feature>]
347       def descriptors
348         descriptor_ids.collect{|id| Feature.find(id)}
349       end
350
351       # Get training substances
352       # @return [Array<OpenTox::Substance>]
353       def substances
354         substance_ids.collect{|id| Substance.find(id)}
355       end
356
357       def fingerprints?
358         algorithms[:descriptors][:method] == "fingerprint" ? true : false
359       end
360
361     end
362
363     # Classification model
364     class LazarClassification < Lazar
365     end
366
367     # Regression model
368     class LazarRegression < Lazar
369     end
370
371     # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
372     class Validation
373
374       include OpenTox
375       include Mongoid::Document
376       include Mongoid::Timestamps
377
378       field :endpoint, type: String
379       field :species, type: String
380       field :source, type: String
381       field :unit, type: String
382       field :model_id, type: BSON::ObjectId
383       field :repeated_crossvalidation_id, type: BSON::ObjectId
384
385       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
386       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
387       # @return [Hash, Array<Hash>, OpenTox::Dataset]
388       def predict object
389         model.predict object
390       end
391
392       # Get training dataset
393       # @return [OpenTox::Dataset]
394       def training_dataset
395         model.training_dataset
396       end
397
398       # Get lazar model
399       # @return [OpenTox::Model::Lazar]
400       def model
401         Lazar.find model_id
402       end
403
404       # Get algorithms
405       # @return [Hash]
406       def algorithms
407         model.algorithms
408       end
409
410       # Get prediction feature
411       # @return [OpenTox::Feature]
412       def prediction_feature
413         model.prediction_feature
414       end
415
416       # Get repeated crossvalidations
417       # @return [OpenTox::Validation::RepeatedCrossValidation]
418       def repeated_crossvalidation
419         OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
420       end
421
422       # Get crossvalidations
423       # @return [Array<OpenTox::CrossValidation]
424       def crossvalidations
425         repeated_crossvalidation.crossvalidations
426       end
427
428       def regression?
429         model.is_a? LazarRegression
430       end
431
432       def classification?
433         model.is_a? LazarClassification
434       end
435
436       # Create and validate a lazar model from a csv file with training data and a json file with metadata
437       # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
438       # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
439       def self.from_csv_file file
440         metadata_file = file.sub(/csv$/,"json")
441         bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
442         model_validation = self.new JSON.parse(File.read(metadata_file))
443         training_dataset = Dataset.from_csv_file file
444         model = Lazar.create training_dataset: training_dataset
445         model_validation[:model_id] = model.id
446         model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
447         model_validation.save
448         model_validation
449       end
450
451       # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
452       # @param [OpenTox::Dataset, nil] training_dataset
453       # @param [OpenTox::Feature, nil] prediction_feature
454       # @param [Hash, nil] algorithms
455       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
456       def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
457         
458         # find/import training_dataset
459         training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
460         unless training_dataset # try to import 
461           Import::Enanomapper.import
462           training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
463           bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
464         end
465         prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
466
467         model_validation = self.new(
468           :endpoint => prediction_feature.name,
469           :source => prediction_feature.source,
470           :species => "A549 human lung epithelial carcinoma cells",
471           :unit => prediction_feature.unit
472         )
473         model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
474         model_validation[:model_id] = model.id
475         repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
476         model_validation[:repeated_crossvalidation_id] = repeated_cv.id
477         model_validation.save
478         model_validation
479       end
480
481     end
482
483   end
484
485 end