model documentation updated
[lazar] / lib / model.rb
module OpenTox

  module Model

    class Lazar

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: "models"

      attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems

      field :name, type: String
      field :creator, type: String, default: __FILE__
      field :algorithms, type: Hash, default: {}
      field :training_dataset_id, type: BSON::ObjectId
      field :substance_ids, type: Array, default: []
      field :prediction_feature_id, type: BSON::ObjectId
      field :dependent_variables, type: Array, default: []
      field :descriptor_ids, type: Array, default: []
      field :independent_variables_id, type: BSON::ObjectId
      field :fingerprints, type: Array, default: []
      field :descriptor_weights, type: Array, default: []
      field :descriptor_means, type: Array, default: []
      field :descriptor_sds, type: Array, default: []
      field :scaled_variables, type: Array, default: []
      field :version, type: Hash, default: {}

      # Create a lazar model
      # @param [OpenTox::Dataset] training_dataset
      # @param [OpenTox::Feature, nil] prediction_feature
      #   By default the first feature of the training dataset will be predicted. Specify a prediction_feature if you want to predict another feature.
      # @param [Hash, nil] algorithms
      #   Default algorithms will be used if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (descriptors used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm) and :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
      # @return [OpenTox::Model::Lazar]
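      # Usage sketch, not part of the original source; the CSV file name is hypothetical:
      # @example Create a model with default algorithms
      #   training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
      #   model = Model::Lazar.create training_dataset: training_dataset
      # @example Override a single default parameter (keys as defined below)
      #   model = Model::Lazar.create training_dataset: training_dataset, algorithms: {:similarity => {:min => 0.3}}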
      def self.create prediction_feature: nil, training_dataset:, algorithms: {}
        bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
        prediction_feature = training_dataset.features.first unless prediction_feature
        # TODO: prediction_feature without training_dataset: use all available data

        # guess model type from the prediction feature
        model = prediction_feature.numeric? ? LazarRegression.new : LazarClassification.new

        model.prediction_feature_id = prediction_feature.id
        model.training_dataset_id = training_dataset.id
        model.name = "#{prediction_feature.name} (#{training_dataset.name})"
        # TODO: check if this works for gem version, add gem versioning?
        dir = File.dirname(__FILE__)
        commit = `cd #{dir}; git rev-parse HEAD`.chomp
        branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
        url = `cd #{dir}; git config --get remote.origin.url`.chomp
        unless branch.empty? # backticks return an empty string if git fails
          model.version = {:url => url, :branch => branch, :commit => commit}
        else
          model.version = {:warning => "git is not installed"}
        end

        # set defaults
        substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
        bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

        if substance_classes.first == "OpenTox::Compound"

          model.algorithms = {
            :descriptors => {
              :method => "fingerprint",
              :type => "MP2D",
            },
            :similarity => {
              :method => "Algorithm::Similarity.tanimoto",
              :min => 0.1
            },
            :feature_selection => nil
          }

          if model.class == LazarClassification
            model.algorithms[:prediction] = {
              :method => "Algorithm::Classification.weighted_majority_vote",
            }
          elsif model.class == LazarRegression
            model.algorithms[:prediction] = {
              :method => "Algorithm::Caret.pls",
            }
          end

        elsif substance_classes.first == "OpenTox::Nanoparticle"
          model.algorithms = {
            :descriptors => {
              :method => "properties",
              :categories => ["P-CHEM"],
            },
            :similarity => {
              :method => "Algorithm::Similarity.weighted_cosine",
              :min => 0.5
            },
            :prediction => {
              :method => "Algorithm::Caret.rf",
            },
            :feature_selection => {
              :method => "Algorithm::FeatureSelection.correlation_filter",
            },
          }
        else
          bad_request_error "Cannot create models for #{substance_classes.first}."
        end

        # overwrite defaults with explicit parameters
        algorithms.each do |type,parameters|
          if parameters and parameters.is_a? Hash
            parameters.each do |p,v|
              model.algorithms[type] ||= {}
              model.algorithms[type][p] = v
              model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
            end
          else
            model.algorithms[type] = parameters
          end
        end if algorithms

        # parse dependent_variables from training dataset
        training_dataset.substances.each do |substance|
          values = training_dataset.values(substance,model.prediction_feature_id)
          values.each do |v|
            model.substance_ids << substance.id.to_s
            model.dependent_variables << v
          end if values
        end

        descriptor_method = model.algorithms[:descriptors][:method]
        model.independent_variables = []
        case descriptor_method
        # parse fingerprints
        when "fingerprint"
          type = model.algorithms[:descriptors][:type]
          model.substances.each_with_index do |s,i|
            model.fingerprints[i] ||= []
            model.fingerprints[i] += s.fingerprint(type)
            model.fingerprints[i].uniq!
          end
          model.descriptor_ids = model.fingerprints.flatten.uniq
          model.descriptor_ids.each do |d|
            # resulting model may break the BSON size limit (e.g. for the Kazius dataset)
            model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
          end
        # calculate physchem properties
        when "calculate_properties"
          features = model.algorithms[:descriptors][:features]
          model.descriptor_ids = features.collect{|f| f.id.to_s}
          model.algorithms[:descriptors].delete(:features)
          model.algorithms[:descriptors].delete(:type)
          model.substances.each_with_index do |s,i|
            props = s.calculate_properties(features)
            props.each_with_index do |v,j|
              model.independent_variables[j] ||= []
              model.independent_variables[j][i] = v
            end if props and !props.empty?
          end
        # parse independent_variables
        when "properties"
          categories = model.algorithms[:descriptors][:categories]
          feature_ids = []
          categories.each do |category|
            Feature.where(category: category).each{|f| feature_ids << f.id.to_s}
          end
          properties = model.substances.collect{|s| s.properties}
          property_ids = properties.collect{|p| p.keys}.flatten.uniq
          model.descriptor_ids = feature_ids & property_ids
          model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
        else
          bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
        end

        if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
          model = Algorithm.run model.algorithms[:feature_selection][:method], model
        end

        # scale independent_variables
        unless model.fingerprints?
          model.independent_variables.each_with_index do |var,i|
            model.descriptor_means[i] = var.mean
            model.descriptor_sds[i] = var.standard_deviation
            model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
          end
        end
        model.save
        model
      end

      # Predict a substance (compound or nanoparticle)
      # @param [OpenTox::Substance] substance
      # @return [Hash]
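      # Illustrative shape of the returned hash, as assembled in the code below; the values are made up and the exact keys depend on the branch taken:
      #   {:value => 0.83, :neighbors => [{:id => ..., :measurement => 0.9, :similarity => 0.6}],
      #    :probabilities => nil, :measurements => [...], :warning => "..."}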
      def predict_substance substance

        @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
        case algorithms[:similarity][:method]
        when /tanimoto/ # binary features
          similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
          # TODO this excludes descriptors only present in the query substance
          # use for applicability domain?
          query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
        when /euclid|cosine/ # quantitative features
          if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
            features = descriptor_ids.collect{|id| Feature.find(id)}
            query_descriptors = substance.calculate_properties(features)
            similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
          else
            similarity_descriptors = []
            query_descriptors = []
            descriptor_ids.each_with_index do |id,i|
              prop = substance.properties[id]
              prop = prop.median if prop.is_a? Array # measured
              if prop
                similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
                query_descriptors[i] = prop
              end
            end
          end
        else
          bad_request_error "Unknown descriptor method '#{algorithms[:descriptors][:method]}' for similarity method '#{algorithms[:similarity][:method]}'."
        end

        prediction = {}
        neighbor_ids = []
        neighbor_similarities = []
        neighbor_dependent_variables = []
        neighbor_independent_variables = []

        # find neighbors
        substance_ids.each_with_index do |s,i|
          # handle query substance
          if substance.id.to_s == s
            prediction[:measurements] ||= []
            prediction[:measurements] << dependent_variables[i]
            prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical to the query substance."
          else
            if fingerprints?
              neighbor_descriptors = fingerprints[i]
            else
              next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle property predictions
              neighbor_descriptors = scaled_variables.collect{|v| v[i]}
            end
            sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
            if sim >= algorithms[:similarity][:min]
              neighbor_ids << s
              neighbor_similarities << sim
              neighbor_dependent_variables << dependent_variables[i]
              independent_variables.each_with_index do |c,j|
                neighbor_independent_variables[j] ||= []
                neighbor_independent_variables[j] << @independent_variables[j][i]
              end
            end
          end
        end

        if neighbor_similarities.empty?
          prediction.merge!({:value => nil, :warning => "Could not find similar substances with experimental data in the training dataset.", :neighbors => []})
        elsif neighbor_similarities.size == 1
          prediction.merge!({:value => neighbor_dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
        else
          query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
          # call prediction algorithm
          result = Algorithm.run algorithms[:prediction][:method], dependent_variables: neighbor_dependent_variables, independent_variables: neighbor_independent_variables, weights: neighbor_similarities, query_variables: query_descriptors
          prediction.merge! result
          prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
        end
        prediction
      end

      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset] object
      # @return [Hash, Array<Hash>, OpenTox::Dataset]
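      # Usage sketch, not part of the original source; the SMILES string is hypothetical:
      # @example Predict a single compound
      #   compound = Compound.from_smiles "CC(=O)Nc1ccc(O)cc1"
      #   prediction = model.predict compound
      #   prediction[:value]
      # @example Predict a whole dataset
      #   prediction_dataset = model.predict test_dataset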
      def predict object

        training_dataset = Dataset.find training_dataset_id

        # parse data
        substances = []
        if object.is_a? Substance
          substances = [object]
        elsif object.is_a? Array
          substances = object
        elsif object.is_a? Dataset
          substances = object.substances
        else
          bad_request_error "Please provide an OpenTox::Substance, an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
        end

        # make predictions
        predictions = {}
        substances.each do |c|
          predictions[c.id.to_s] = predict_substance c
          predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
        end

        # serialize result
        if object.is_a? Substance
          prediction = predictions[substances.first.id.to_s]
          prediction[:neighbors].sort!{|a,b| b[:similarity] <=> a[:similarity]} # sort according to similarity
          return prediction
        elsif object.is_a? Array
          return predictions
        elsif object.is_a? Dataset
          # prepare prediction dataset
          measurement_feature = Feature.find prediction_feature_id

          prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
          prediction_dataset = LazarPrediction.create(
            :name => "Lazar prediction for #{prediction_feature.name}",
            :creator => __FILE__,
            :prediction_feature_id => prediction_feature.id,
            :predictions => predictions
          )
          return prediction_dataset
        end

      end

      def save # store independent_variables in GridFS to avoid Mongo database size limit problems
        file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
        self.independent_variables_id = $gridfs.insert_one(file)
        super
      end

      # Get independent variables
      # @return [Array<Array>]
      def independent_variables
        @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
        @independent_variables
      end

      # Get training dataset
      # @return [OpenTox::Dataset]
      def training_dataset
        Dataset.find(training_dataset_id)
      end

      # Get prediction feature
      # @return [OpenTox::Feature]
      def prediction_feature
        Feature.find(prediction_feature_id)
      end

      # Get training descriptors
      # @return [Array<OpenTox::Feature>]
      def descriptors
        descriptor_ids.collect{|id| Feature.find(id)}
      end

      # Get training substances
      # @return [Array<OpenTox::Substance>]
      def substances
        substance_ids.collect{|id| Substance.find(id)}
      end

      def fingerprints?
        algorithms[:descriptors][:method] == "fingerprint"
      end

    end

    # Classification model
    class LazarClassification < Lazar
    end

    # Regression model
    class LazarRegression < Lazar
    end

    # Convenience class for generating and validating lazar models in a single step and for predicting substances (compounds and nanoparticles), arrays of substances and datasets
    class Validation

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps

      field :endpoint, type: String
      field :species, type: String
      field :source, type: String
      field :unit, type: String
      field :model_id, type: BSON::ObjectId
      field :repeated_crossvalidation_id, type: BSON::ObjectId

      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset] object
      # @return [Hash, Array<Hash>, OpenTox::Dataset]
      def predict object
        model.predict object
      end

      # Get training dataset
      # @return [OpenTox::Dataset]
      def training_dataset
        model.training_dataset
      end

      # Get lazar model
      # @return [OpenTox::Model::Lazar]
      def model
        Lazar.find model_id
      end

      # Get algorithms
      # @return [Hash]
      def algorithms
        model.algorithms
      end

      # Get prediction feature
      # @return [OpenTox::Feature]
      def prediction_feature
        model.prediction_feature
      end

      # Get repeated crossvalidations
      # @return [OpenTox::Validation::RepeatedCrossValidation]
      def repeated_crossvalidation
        OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
      end

      # Get crossvalidations
      # @return [Array<OpenTox::CrossValidation>]
      def crossvalidations
        repeated_crossvalidation.crossvalidations
      end

      def regression?
        model.is_a? LazarRegression
      end

      def classification?
        model.is_a? LazarClassification
      end

      # Create and validate a lazar model from a csv file with training data and a json file with metadata
      # @param [File] CSV file with two columns. The first line should contain the column names, i.e. SMILES or InChI (first column) and the endpoint (second column). The remaining lines should contain the SMILES or InChI of the training compounds in the first column and their toxic activities (qualitative or quantitative) in the second column. Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
      # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
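      # Usage sketch, not part of the original source; the file name is hypothetical, and "EPAFHM.csv" must be accompanied by an "EPAFHM.json" metadata file:
      # @example
      #   model_validation = Model::Validation.from_csv_file "EPAFHM.csv"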
      def self.from_csv_file file
        metadata_file = file.sub(/csv$/,"json")
        bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
        model_validation = self.new JSON.parse(File.read(metadata_file))
        training_dataset = Dataset.from_csv_file file
        model = Lazar.create training_dataset: training_dataset
        model_validation[:model_id] = model.id
        model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
        model_validation.save
        model_validation
      end

      # Create and validate a nano-lazar model, importing data from eNanoMapper if necessary
      # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
      # @param [OpenTox::Dataset, nil] training_dataset
      # @param [OpenTox::Feature, nil] prediction_feature
      # @param [Hash, nil] algorithms
      # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
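      # Usage sketch, not part of the original source; relies on the default dataset, feature and algorithms set in the method below:
      # @example
      #   model_validation = Model::Validation.from_enanomapper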
      def self.from_enanomapper training_dataset: nil, prediction_feature: nil, algorithms: nil

        # find/import training_dataset
        training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
        unless training_dataset # try to import
          Import::Enanomapper.import
          training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
          bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
        end
        prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first

        model_validation = self.new(
          :endpoint => prediction_feature.name,
          :source => prediction_feature.source,
          :species => "A549 human lung epithelial carcinoma cells",
          :unit => prediction_feature.unit
        )
        model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
        model_validation[:model_id] = model.id
        repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
        model_validation[:repeated_crossvalidation_id] = repeated_cv.id
        model_validation.save
        model_validation
      end

    end

  end

end