caf8a6eb624c82a1329c664fa1aed8b1718e5f28
[lazar] / lib / model.rb
1 module OpenTox
2
3   module Model
4
5     class Lazar 
6
7       include OpenTox
8       include Mongoid::Document
9       include Mongoid::Timestamps
10       store_in collection: "models"
11
      # independent variables are not a Mongoid field: they are serialized to GridFS (see #save / #independent_variables)
      attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems

      field :name, type: String
      field :creator, type: String, default: __FILE__
      # algorithm configuration hash with keys :descriptors, :similarity, :prediction, :feature_selection (see Lazar.create)
      field :algorithms, type: Hash, default:{}
      field :training_dataset_id, type: BSON::ObjectId
      # ids of training substances, parallel to dependent_variables
      field :substance_ids, type: Array, default:[]
      field :prediction_feature_id, type: BSON::ObjectId
      # measured activities of the training substances (one entry per measurement, substances may repeat)
      field :dependent_variables, type: Array, default:[]
      field :descriptor_ids, type:Array, default:[]
      # GridFS file id of the serialized independent variables matrix
      field :independent_variables_id, type: BSON::ObjectId
      # per-substance fingerprint descriptor lists (only for fingerprint models)
      field :fingerprints, type: Array, default:[]
      field :descriptor_weights, type: Array, default:[]
      # means/standard deviations used to scale quantitative descriptors (unused for fingerprints)
      field :descriptor_means, type: Array, default:[]
      field :descriptor_sds, type: Array, default:[]
      field :scaled_variables, type: Array, default:[]
      # git or gem version info ({:url, :branch, :commit}) recorded at model creation
      field :version, type: Hash, default:{}
29       
30       # Create a lazar model
31       # @param [OpenTox::Dataset] training_dataset
32       # @param [OpenTox::Feature, nil] prediction_feature
33       #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
34       # @param [Hash, nil] algorithms
35       #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. 
36       #
37       # @return [OpenTox::Model::Lazar]
38       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
39         raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
40         prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
41
42         # guess model type
43         prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
44
45         model.prediction_feature_id = prediction_feature.id
46         model.training_dataset_id = training_dataset.id
47         model.name = training_dataset.name
48         
49         # git or gem versioning
50         dir = File.dirname(__FILE__)
51         path = File.expand_path("../", File.expand_path(dir))
52         if Dir.exists?(dir+"/.git")
53           commit = `git rev-parse HEAD`.chomp
54           branch = `git rev-parse --abbrev-ref HEAD`.chomp
55           url = `git config --get remote.origin.url`.chomp
56           model.version = {:url => url, :branch => branch, :commit => commit}
57         else
58           version = File.open(path+"/VERSION", &:gets).chomp
59           url = "https://rubygems.org/gems/lazar/versions/"+version
60           model.version = {:url => url, :branch => "gem", :commit => version}
61         end
62
63         # set defaults#
64         substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
65         raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
66
67         if substance_classes.first == "OpenTox::Compound"
68
69           model.algorithms = {
70             :descriptors => {
71               :method => "fingerprint",
72               :type => "MP2D",
73             },
74             :feature_selection => nil
75           }
76
77           if model.class == LazarClassification
78             model.algorithms[:prediction] = {
79                 :method => "Algorithm::Classification.weighted_majority_vote",
80             }
81             model.algorithms[:similarity] = {
82               :method => "Algorithm::Similarity.tanimoto",
83               :min => 0.5,
84             }
85           elsif model.class == LazarRegression
86             model.algorithms[:prediction] = {
87               :method => "Algorithm::Caret.rf",
88             }
89             model.algorithms[:similarity] = {
90               :method => "Algorithm::Similarity.tanimoto",
91               :min => 0.5,
92             }
93           end
94
95         elsif substance_classes.first == "OpenTox::Nanoparticle"
96           model.algorithms = {
97             :descriptors => {
98               :method => "properties",
99               :categories => ["P-CHEM"],
100             },
101             :similarity => {
102               :method => "Algorithm::Similarity.weighted_cosine",
103               :min => 0.5,
104             },
105             :prediction => {
106               :method => "Algorithm::Caret.rf",
107             },
108             :feature_selection => {
109               :method => "Algorithm::FeatureSelection.correlation_filter",
110             },
111           }
112         else
113           raise ArgumentError, "Cannot create models for #{substance_classes.first}."
114         end
115         
116         # overwrite defaults with explicit parameters
117         algorithms.each do |type,parameters|
118           if parameters and parameters.is_a? Hash
119             parameters.each do |p,v|
120               model.algorithms[type] ||= {}
121               model.algorithms[type][p] = v
122               model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
123             end
124           else
125             model.algorithms[type] = parameters
126           end
127         end if algorithms
128
129         # parse dependent_variables from training dataset
130         training_dataset.substances.each do |substance|
131           values = training_dataset.values(substance,model.prediction_feature_id)
132           values.each do |v|
133             model.substance_ids << substance.id.to_s
134             model.dependent_variables << v
135           end if values
136         end
137
138         descriptor_method = model.algorithms[:descriptors][:method]
139         model.independent_variables = []
140         case descriptor_method
141         # parse fingerprints
142         when "fingerprint"
143           type = model.algorithms[:descriptors][:type]
144           model.substances.each_with_index do |s,i|
145             model.fingerprints[i] ||= [] 
146             model.fingerprints[i] += s.fingerprint(type)
147             model.fingerprints[i].uniq!
148           end
149           model.descriptor_ids = model.fingerprints.flatten.uniq
150           model.descriptor_ids.each do |d|
151             model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
152           end
153         # calculate physchem properties
154         when "calculate_properties"
155           features = model.algorithms[:descriptors][:features]
156           model.descriptor_ids = features.collect{|f| f.id.to_s}
157           model.algorithms[:descriptors].delete(:features)
158           model.algorithms[:descriptors].delete(:type)
159           model.substances.each_with_index do |s,i|
160             props = s.calculate_properties(features)
161             props.each_with_index do |v,j|
162               model.independent_variables[j] ||= []
163               model.independent_variables[j][i] = v
164             end if props and !props.empty?
165           end
166         # parse independent_variables
167         when "properties"
168           categories = model.algorithms[:descriptors][:categories]
169           feature_ids = []
170           categories.each do |category|
171             Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
172           end
173           properties = model.substances.collect { |s| s.properties  }
174           property_ids = properties.collect{|p| p.keys}.flatten.uniq
175           model.descriptor_ids = feature_ids & property_ids
176           model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
177         else
178           raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
179         end
180         
181         if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
182           model = Algorithm.run model.algorithms[:feature_selection][:method], model
183         end
184
185         # scale independent_variables
186         unless model.fingerprints?
187           model.independent_variables.each_with_index do |var,i|
188             model.descriptor_means[i] = var.mean
189             model.descriptor_sds[i] =  var.standard_deviation
190             model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
191           end
192         end
193         model.save
194         model
195       end
196
197       # Predict a substance (compound or nanoparticle)
198       # @param [OpenTox::Substance]
199       # @return [Hash]
200       def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil
201         
202         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
203         case algorithms[:similarity][:method]
204         when /tanimoto/ # binary features
205           similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
206           # TODO this excludes descriptors only present in the query substance
207           # use for applicability domain?
208           query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
209         when /euclid|cosine/ # quantitative features
210           if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
211             features = descriptor_ids.collect{|id| Feature.find(id)}
212             query_descriptors = substance.calculate_properties(features)
213             similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
214           else
215             similarity_descriptors = []
216             query_descriptors = []
217             descriptor_ids.each_with_index do |id,i|
218               prop = substance.properties[id]
219               prop = prop.median if prop.is_a? Array # measured
220               if prop
221                 similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
222                 query_descriptors[i] = prop
223               end
224             end
225           end
226         else
227           raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
228         end
229         
230         prediction ||= {:warnings => [], :measurements => []}
231         prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
232         neighbor_ids = []
233         neighbor_similarities = []
234         neighbor_dependent_variables = []
235         neighbor_independent_variables = []
236
237         # find neighbors
238         substance_ids.each_with_index do |s,i|
239           # handle query substance
240           if substance.id.to_s == s
241             prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass
242             prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
243           else
244             if fingerprints?
245               neighbor_descriptors = fingerprints[i]
246             else
247               next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
248               neighbor_descriptors = scaled_variables.collect{|v| v[i]}
249             end
250             sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
251             if sim >= threshold
252               neighbor_ids << s
253               neighbor_similarities << sim
254               neighbor_dependent_variables << dependent_variables[i]
255               independent_variables.each_with_index do |c,j|
256                 neighbor_independent_variables[j] ||= []
257                 neighbor_independent_variables[j] << @independent_variables[j][i]
258               end
259             end
260           end
261         end
262
263         measurements = nil
264         
265         if neighbor_similarities.empty?
266           prediction[:value] = nil
267           prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
268         elsif neighbor_similarities.size == 1
269           prediction[:value] = nil
270           prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
271           prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
272         else
273           query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
274           # call prediction algorithm
275           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
276           prediction.merge! result
277           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
278         end
279         if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
280           prediction
281         else # try again with a lower threshold
282           prediction[:warnings] << "Lowering similarity threshold to 0.2."
283           predict_substance substance, 0.2, prediction
284         end
285       end
286
287       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
288       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
289       # @return [Hash, Array<Hash>, OpenTox::Dataset]
290       def predict object
291
292         training_dataset = Dataset.find training_dataset_id
293
294         # parse data
295         substances = []
296         if object.is_a? Substance
297           substances = [object] 
298         elsif object.is_a? Array
299           substances = object
300         elsif object.is_a? Dataset
301           substances = object.substances
302         else 
303           raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
304         end
305
306         # make predictions
307         predictions = {}
308         substances.each do |c|
309           predictions[c.id.to_s] = predict_substance c
310           if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
311             prediction_feature.accept_values.each do |v|
312               predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
313             end
314           end
315           predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id 
316         end
317
318         # serialize result
319         if object.is_a? Substance
320           prediction = predictions[substances.first.id.to_s]
321           prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
322           return prediction
323         elsif object.is_a? Array
324           return predictions
325         elsif object.is_a? Dataset
326           d = object.copy
327           warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
328           if prediction_feature.is_a? NominalBioActivity
329             f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
330             probability_features = {}
331             prediction_feature.accept_values.each do |v|
332               probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
333             end
334           elsif prediction_feature.is_a? NumericBioActivity
335             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
336             prediction_interval = {}
337             ["lower","upper"].each do |v|
338               prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
339             end
340           end
341
342           # add predictions to dataset
343           predictions.each do |substance_id,p|
344             substance_id = BSON::ObjectId.from_string(substance_id)
345             d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty?
346             unless p[:value].nil?
347               d.add substance_id,f,p[:value]
348               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
349               p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
350             end
351           end
352           d.save
353           return d
354         end
355
356       end
357
358       # Save the model
359       #   Stores independent_variables in GridFS to avoid Mongo database size limit problems
360       def save
361         file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
362         self.independent_variables_id = $gridfs.insert_one(file)
363         super
364       end
365
366       # Get independent variables
367       # @return [Array<Array>]
368       def independent_variables 
369         @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
370         @independent_variables
371       end
372
373       # Get training dataset
374       # @return [OpenTox::Dataset]
375       def training_dataset
376         Dataset.find(training_dataset_id)
377       end
378
379       # Get prediction feature
380       # @return [OpenTox::Feature]
381       def prediction_feature
382         Feature.find(prediction_feature_id)
383       end
384
385       # Get training descriptors
386       # @return [Array<OpenTox::Feature>]
387       def descriptors
388         descriptor_ids.collect{|id| Feature.find(id)}
389       end
390
391       # Get training substances
392       # @return [Array<OpenTox::Substance>]
393       def substances
394         substance_ids.collect{|id| Substance.find(id)}
395       end
396
397       # Are fingerprints used as descriptors
398       # @return [TrueClass, FalseClass]
399       def fingerprints?
400         algorithms[:descriptors][:method] == "fingerprint" ? true : false
401       end
402
403     end
404
    # Classification model for nominal (categorical) bioactivities; all behavior is inherited from Lazar
    class LazarClassification < Lazar
    end
408
    # Regression model for numeric bioactivities; all behavior is inherited from Lazar
    class LazarRegression < Lazar
    end
412
413     # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
414     class Validation
415
      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps

      # metadata describing the validated model (typically read from a JSON metadata file, see from_csv_file)
      field :endpoint, type: String
      field :qmrf, type: Hash
      field :species, type: String
      field :source, type: String
      # unit of the endpoint (regression only)
      field :unit, type: String
      field :warnings, type: Array
      # reference to the underlying Lazar model
      field :model_id, type: BSON::ObjectId
      # reference to the repeated crossvalidation of the model
      field :repeated_crossvalidation_id, type: BSON::ObjectId
428
429       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
430       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
431       # @return [Hash, Array<Hash>, OpenTox::Dataset]
432       def predict object
433         model.predict object
434       end
435
436       # Get training dataset
437       # @return [OpenTox::Dataset]
438       def training_dataset
439         model.training_dataset
440       end
441
442       # Get lazar model
443       # @return [OpenTox::Model::Lazar]
444       def model
445         Lazar.find model_id
446       end
447
448       # Get algorithms
449       # @return [Hash]
450       def algorithms
451         model.algorithms
452       end
453
454       # Get prediction feature
455       # @return [OpenTox::Feature]
456       def prediction_feature
457         model.prediction_feature
458       end
459
460       # Get repeated crossvalidations
461       # @return [OpenTox::Validation::RepeatedCrossValidation]
462       def repeated_crossvalidation
463         OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
464       end
465
466       # Get crossvalidations
467       # @return [Array<OpenTox::CrossValidation]
468       def crossvalidations
469         repeated_crossvalidation.crossvalidations
470       end
471
472       # Is it a regression model
473       # @return [TrueClass, FalseClass]
474       def regression?
475         model.is_a? LazarRegression
476       end
477
478       # Is it a classification model
479       # @return [TrueClass, FalseClass]
480       def classification?
481         model.is_a? LazarClassification
482       end
483
484       # Create and validate a lazar model from a csv file with training data and a json file with metadata
485       # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
486       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
487       def self.from_csv_file file
488         metadata_file = file.sub(/csv$/,"json")
489         raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file
490         model_validation = self.new JSON.parse(File.read(metadata_file))
491         training_dataset = Dataset.from_csv_file file
492         model = Lazar.create training_dataset: training_dataset
493         model_validation[:model_id] = model.id
494         model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
495         model_validation.save
496         model_validation
497       end
498
499       # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
500       #   nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
501       #   *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track this changes permanently!*
502       # @param [OpenTox::Dataset, nil] training_dataset
503       # @param [OpenTox::Feature, nil] prediction_feature
504       # @param [Hash, nil] algorithms
505       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
506       def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
507         
508         # find/import training_dataset
509         training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
510         unless training_dataset # try to import 
511           Import::Enanomapper.import
512           training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
513           raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
514         end
515         prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
516
517         model_validation = self.new(
518           :endpoint => prediction_feature.name,
519           :source => prediction_feature.source,
520           :species => "A549 human lung epithelial carcinoma cells",
521           :unit => prediction_feature.unit
522         )
523         model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
524         model_validation[:model_id] = model.id
525         repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
526         model_validation[:repeated_crossvalidation_id] = repeated_cv.id
527         model_validation.save
528         model_validation
529       end
530
531     end
532
533   end
534
535 end