real datasets for testing, test data cleanup, Daphnia import, upper and lower similar...
[lazar] / lib / model.rb
1 module OpenTox
2
3   module Model
4
    # Local QSAR model: predicts an activity from the measurements of
    # structurally similar substances in the training dataset.
    class Lazar 

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: "models"

      # Writer only: the (potentially large) descriptor matrix is serialized to
      # GridFS on save instead of being embedded in the Mongo document.
      attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems

      field :name, type: String                       # model name, set from the training dataset name in Lazar.create
      field :creator, type: String, default: __FILE__ # provenance: file that created the model
      field :algorithms, type: Hash, default:{}       # algorithm configuration (:descriptors, :similarity, :prediction, :feature_selection)
      field :training_dataset_id, type: BSON::ObjectId
      field :substance_ids, type: Array, default:[]   # training substance ids, parallel to dependent_variables
      field :prediction_feature_id, type: BSON::ObjectId
      field :dependent_variables, type: Array, default:[]  # measured activities, one entry per substance_ids element
      field :descriptor_ids, type:Array, default:[]        # ids of the descriptors used as independent variables
      field :independent_variables_id, type: BSON::ObjectId # GridFS id of the Marshal-dumped descriptor matrix
      field :fingerprints, type: Array, default:[]    # per-substance fingerprint feature lists (fingerprint models only)
      field :descriptor_weights, type: Array, default:[] # passed to the similarity algorithm (e.g. weighted cosine)
      field :descriptor_means, type: Array, default:[]   # per-descriptor means for scaling (non-fingerprint models)
      field :descriptor_sds, type: Array, default:[]     # per-descriptor standard deviations for scaling
      field :scaled_variables, type: Array, default:[]   # z-scaled independent variables (non-fingerprint models)
      field :version, type: Hash, default:{}          # git or gem version info recorded at creation time
29       
30       # Create a lazar model
31       # @param [OpenTox::Dataset] training_dataset
32       # @param [OpenTox::Feature, nil] prediction_feature
33       #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
34       # @param [Hash, nil] algorithms
35       #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. 
36       #
37       # @return [OpenTox::Model::Lazar]
38       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
39         raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
40         prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
41
42         # guess model type
43         prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
44
45         model.prediction_feature_id = prediction_feature.id
46         model.training_dataset_id = training_dataset.id
47         model.name = training_dataset.name
48         
49         # git or gem versioning
50         dir = File.dirname(__FILE__)
51         path = File.expand_path("../", File.expand_path(dir))
52         if Dir.exists?(dir+"/.git")
53           commit = `git rev-parse HEAD`.chomp
54           branch = `git rev-parse --abbrev-ref HEAD`.chomp
55           url = `git config --get remote.origin.url`.chomp
56           model.version = {:url => url, :branch => branch, :commit => commit}
57         else
58           version = File.open(path+"/VERSION", &:gets).chomp
59           url = "https://rubygems.org/gems/lazar/versions/"+version
60           model.version = {:url => url, :branch => "gem", :commit => version}
61         end
62
63         # set defaults#
64         substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
65         raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
66
67         if substance_classes.first == "OpenTox::Compound"
68
69           model.algorithms = {
70             :descriptors => {
71               :method => "fingerprint",
72               :type => "MP2D",
73             },
74             :feature_selection => nil
75           }
76
77           if model.class == LazarClassification
78             model.algorithms[:prediction] = {
79                 :method => "Algorithm::Classification.weighted_majority_vote",
80             }
81             model.algorithms[:similarity] = {
82               :method => "Algorithm::Similarity.tanimoto",
83               :min => [0.5,0.2],
84             }
85           elsif model.class == LazarRegression
86             model.algorithms[:prediction] = {
87               :method => "Algorithm::Caret.rf",
88             }
89             model.algorithms[:similarity] = {
90               :method => "Algorithm::Similarity.tanimoto",
91               :min => [0.5,0.2],
92             }
93           end
94
95         elsif substance_classes.first == "OpenTox::Nanoparticle"
96           model.algorithms = {
97             :descriptors => {
98               :method => "properties",
99               :categories => ["P-CHEM"],
100             },
101             :similarity => {
102               :method => "Algorithm::Similarity.weighted_cosine",
103               :min => [0.5,0.2],
104             },
105             :prediction => {
106               :method => "Algorithm::Caret.rf",
107             },
108             :feature_selection => {
109               :method => "Algorithm::FeatureSelection.correlation_filter",
110             },
111           }
112         else
113           raise ArgumentError, "Cannot create models for #{substance_classes.first}."
114         end
115         
116         # overwrite defaults with explicit parameters
117         algorithms.each do |type,parameters|
118           if parameters and parameters.is_a? Hash
119             parameters.each do |p,v|
120               model.algorithms[type] ||= {}
121               model.algorithms[type][p] = v
122               model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
123             end
124           else
125             model.algorithms[type] = parameters
126           end
127         end if algorithms
128
129         # parse dependent_variables from training dataset
130         training_dataset.substances.each do |substance|
131           values = training_dataset.values(substance,model.prediction_feature_id)
132           values.each do |v|
133             model.substance_ids << substance.id.to_s
134             model.dependent_variables << v
135           end if values
136         end
137
138         descriptor_method = model.algorithms[:descriptors][:method]
139         model.independent_variables = []
140         case descriptor_method
141         # parse fingerprints
142         when "fingerprint"
143           type = model.algorithms[:descriptors][:type]
144           model.substances.each_with_index do |s,i|
145             model.fingerprints[i] ||= [] 
146             model.fingerprints[i] += s.fingerprint(type)
147             model.fingerprints[i].uniq!
148           end
149           model.descriptor_ids = model.fingerprints.flatten.uniq
150           model.descriptor_ids.each do |d|
151             model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
152           end
153         # calculate physchem properties
154         when "calculate_properties"
155           features = model.algorithms[:descriptors][:features]
156           model.descriptor_ids = features.collect{|f| f.id.to_s}
157           model.algorithms[:descriptors].delete(:features)
158           model.algorithms[:descriptors].delete(:type)
159           model.substances.each_with_index do |s,i|
160             props = s.calculate_properties(features)
161             props.each_with_index do |v,j|
162               model.independent_variables[j] ||= []
163               model.independent_variables[j][i] = v
164             end if props and !props.empty?
165           end
166         # parse independent_variables
167         when "properties"
168           categories = model.algorithms[:descriptors][:categories]
169           feature_ids = []
170           categories.each do |category|
171             Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
172           end
173           properties = model.substances.collect { |s| s.properties  }
174           property_ids = properties.collect{|p| p.keys}.flatten.uniq
175           model.descriptor_ids = feature_ids & property_ids
176           model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
177         else
178           raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
179         end
180         
181         if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
182           model = Algorithm.run model.algorithms[:feature_selection][:method], model
183         end
184
185         # scale independent_variables
186         unless model.fingerprints?
187           model.independent_variables.each_with_index do |var,i|
188             model.descriptor_means[i] = var.mean
189             model.descriptor_sds[i] =  var.standard_deviation
190             model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
191           end
192         end
193         model.save
194         model
195       end
196
197       # Predict a substance (compound or nanoparticle)
198       # @param [OpenTox::Substance]
199       # @return [Hash]
200       def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
201         
202         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
203         case algorithms[:similarity][:method]
204         when /tanimoto/ # binary features
205           similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
206           # TODO this excludes descriptors only present in the query substance
207           # use for applicability domain?
208           query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
209         when /euclid|cosine/ # quantitative features
210           if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
211             features = descriptor_ids.collect{|id| Feature.find(id)}
212             query_descriptors = substance.calculate_properties(features)
213             similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
214           else
215             similarity_descriptors = []
216             query_descriptors = []
217             descriptor_ids.each_with_index do |id,i|
218               prop = substance.properties[id]
219               prop = prop.median if prop.is_a? Array # measured
220               if prop
221                 similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
222                 query_descriptors[i] = prop
223               end
224             end
225           end
226         else
227           raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
228         end
229         
230         prediction ||= {:warnings => [], :measurements => []}
231         prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
232         neighbor_ids = []
233         neighbor_similarities = []
234         neighbor_dependent_variables = []
235         neighbor_independent_variables = []
236
237         # find neighbors
238         substance_ids.each_with_index do |s,i|
239           # handle query substance
240           if substance.id.to_s == s
241             prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
242             prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
243           else
244             if fingerprints?
245               neighbor_descriptors = fingerprints[i]
246             else
247               next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
248               neighbor_descriptors = scaled_variables.collect{|v| v[i]}
249             end
250             sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
251             if sim >= threshold
252               neighbor_ids << s
253               neighbor_similarities << sim
254               neighbor_dependent_variables << dependent_variables[i]
255               independent_variables.each_with_index do |c,j|
256                 neighbor_independent_variables[j] ||= []
257                 neighbor_independent_variables[j] << @independent_variables[j][i]
258               end
259             end
260           end
261         end
262
263         measurements = nil
264         
265         if neighbor_similarities.empty?
266           prediction[:value] = nil
267           prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
268           if threshold == algorithms[:similarity][:min].last
269             prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
270             return prediction
271           end
272         elsif neighbor_similarities.size == 1
273           prediction[:value] = nil
274           prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
275           prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
276           if threshold == algorithms[:similarity][:min].last
277             prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
278             return prediction
279           end
280         else
281           query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
282           # call prediction algorithm
283           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
284           prediction.merge! result
285           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
286         end
287         if threshold == algorithms[:similarity][:min].first
288           if prediction[:warnings].empty? 
289             prediction[:confidence] = "High (close to bioassay results)"
290             return prediction
291           else # try again with a lower threshold
292             prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
293             predict_substance substance, algorithms[:similarity][:min].last, prediction
294           end
295         elsif threshold < algorithms[:similarity][:min].first
296           prediction[:confidence] = "Low (lower than bioassay results)"
297           return prediction
298         end
299       end
300
301       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
302       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
303       # @return [Hash, Array<Hash>, OpenTox::Dataset]
304       def predict object
305
306         training_dataset = Dataset.find training_dataset_id
307
308         # parse data
309         substances = []
310         if object.is_a? Substance
311           substances = [object] 
312         elsif object.is_a? Array
313           substances = object
314         elsif object.is_a? Dataset
315           substances = object.substances
316         else 
317           raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
318         end
319
320         # make predictions
321         predictions = {}
322         substances.each do |c|
323           predictions[c.id.to_s] = predict_substance c
324           if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
325             prediction_feature.accept_values.each do |v|
326               predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
327             end
328           end
329           predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id 
330         end
331
332         # serialize result
333         if object.is_a? Substance
334           prediction = predictions[substances.first.id.to_s]
335           prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
336           return prediction
337         elsif object.is_a? Array
338           return predictions
339         elsif object.is_a? Dataset
340           d = object.copy
341           warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
342           if prediction_feature.is_a? NominalBioActivity
343             f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
344             probability_features = {}
345             prediction_feature.accept_values.each do |v|
346               probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
347             end
348           elsif prediction_feature.is_a? NumericBioActivity
349             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
350             prediction_interval = {}
351             ["lower","upper"].each do |v|
352               prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
353             end
354           end
355
356           # add predictions to dataset
357           predictions.each do |substance_id,p|
358             substance_id = BSON::ObjectId.from_string(substance_id)
359             d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty?
360             unless p[:value].nil?
361               d.add substance_id,f,p[:value]
362               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
363               p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
364             end
365           end
366           d.save
367           return d
368         end
369
370       end
371
372       # Save the model
373       #   Stores independent_variables in GridFS to avoid Mongo database size limit problems
374       def save
375         file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
376         self.independent_variables_id = $gridfs.insert_one(file)
377         super
378       end
379
380       # Get independent variables
381       # @return [Array<Array>]
382       def independent_variables 
383         @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
384         @independent_variables
385       end
386
387       # Get training dataset
388       # @return [OpenTox::Dataset]
389       def training_dataset
390         Dataset.find(training_dataset_id)
391       end
392
393       # Get prediction feature
394       # @return [OpenTox::Feature]
395       def prediction_feature
396         Feature.find(prediction_feature_id)
397       end
398
399       # Get training descriptors
400       # @return [Array<OpenTox::Feature>]
401       def descriptors
402         descriptor_ids.collect{|id| Feature.find(id)}
403       end
404
405       # Get training substances
406       # @return [Array<OpenTox::Substance>]
407       def substances
408         substance_ids.collect{|id| Substance.find(id)}
409       end
410
411       # Are fingerprints used as descriptors
412       # @return [TrueClass, FalseClass]
413       def fingerprints?
414         algorithms[:descriptors][:method] == "fingerprint" ? true : false
415       end
416
417     end
418
    # Classification model
    #   All behavior is inherited from Lazar; the subclass is used to select
    #   classification defaults in Lazar.create and by Validation#classification?.
    class LazarClassification < Lazar
    end

    # Regression model
    #   All behavior is inherited from Lazar; the subclass is used to select
    #   regression defaults in Lazar.create and by Validation#regression?.
    class LazarRegression < Lazar
    end
426
    # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
    class Validation

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps

      field :endpoint, type: String  # toxicological endpoint name
      field :qmrf, type: Hash        # QMRF report metadata (optional)
      field :species, type: String   # species the measurements were taken in
      field :source, type: String    # data source / citation
      field :unit, type: String      # measurement unit (regression endpoints only)
      field :warnings, type: Array
      field :model_id, type: BSON::ObjectId                    # underlying Lazar model
      field :repeated_crossvalidation_id, type: BSON::ObjectId # associated validation results
442
443       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
444       # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
445       # @return [Hash, Array<Hash>, OpenTox::Dataset]
446       def predict object
447         model.predict object
448       end
449
450       # Get training dataset
451       # @return [OpenTox::Dataset]
452       def training_dataset
453         model.training_dataset
454       end
455
456       # Get lazar model
457       # @return [OpenTox::Model::Lazar]
458       def model
459         Lazar.find model_id
460       end
461
462       # Get algorithms
463       # @return [Hash]
464       def algorithms
465         model.algorithms
466       end
467
468       # Get prediction feature
469       # @return [OpenTox::Feature]
470       def prediction_feature
471         model.prediction_feature
472       end
473
474       # Get repeated crossvalidations
475       # @return [OpenTox::Validation::RepeatedCrossValidation]
476       def repeated_crossvalidation
477         OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
478       end
479
480       # Get crossvalidations
481       # @return [Array<OpenTox::CrossValidation]
482       def crossvalidations
483         repeated_crossvalidation.crossvalidations
484       end
485
486       # Is it a regression model
487       # @return [TrueClass, FalseClass]
488       def regression?
489         model.is_a? LazarRegression
490       end
491
492       # Is it a classification model
493       # @return [TrueClass, FalseClass]
494       def classification?
495         model.is_a? LazarClassification
496       end
497
498       # Create and validate a lazar model from a csv file with training data and a json file with metadata
499       # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
500       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
501       def self.from_csv_file file
502         metadata_file = file.sub(/csv$/,"json")
503         raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file
504         model_validation = self.new JSON.parse(File.read(metadata_file))
505         training_dataset = Dataset.from_csv_file file
506         model = Lazar.create training_dataset: training_dataset
507         model_validation[:model_id] = model.id
508         model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
509         model_validation.save
510         model_validation
511       end
512
      # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
      #   nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
      #   *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track this changes permanently!*
      # @param [OpenTox::Dataset, nil] training_dataset
      # @param [OpenTox::Feature, nil] prediction_feature
      # @param [Hash, nil] algorithms
      # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
      def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
        
        # find/import training_dataset
        training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
        unless training_dataset # try to import 
          Import::Enanomapper.import
          training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
          raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
        end
        # default endpoint: log2 net cell association from the TOX category
        prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first

        model_validation = self.new(
          :endpoint => prediction_feature.name,
          :source => prediction_feature.source,
          :species => "A549 human lung epithelial carcinoma cells",
          :unit => prediction_feature.unit
        )
        # regression model with nanoparticle defaults (see Lazar.create)
        model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
        model_validation[:model_id] = model.id
        # NOTE(review): extra arguments presumably (folds, repeats) = (10, 5) — confirm against RepeatedCrossValidation.create
        repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
        model_validation[:repeated_crossvalidation_id] = repeated_cv.id
        model_validation.save
        model_validation
      end
544
545     end
546
547   end
548
549 end