module OpenTox

  module Model

    class Lazar

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: "models"

      attr_writer :independent_variables # stored in GridFS to avoid Mongo database size limit problems

      field :name, type: String
      field :creator, type: String, default: __FILE__
      field :algorithms, type: Hash, default: {}
      field :training_dataset_id, type: BSON::ObjectId
      field :substance_ids, type: Array, default: []
      field :prediction_feature_id, type: BSON::ObjectId
      field :dependent_variables, type: Array, default: []
      field :descriptor_ids, type: Array, default: []
      field :independent_variables_id, type: BSON::ObjectId
      field :fingerprints, type: Array, default: []
      field :descriptor_weights, type: Array, default: []
      field :descriptor_means, type: Array, default: []
      field :descriptor_sds, type: Array, default: []
      field :scaled_variables, type: Array, default: []
      field :version, type: Hash, default: {}

      # Create a lazar model
      # @param [OpenTox::Dataset] training_dataset
      # @param [OpenTox::Feature, nil] prediction_feature
      #   By default the first feature of the training dataset will be predicted.
      #   Specify a prediction_feature if you want to predict another feature.
      # @param [Hash, nil] algorithms
      #   Default algorithms will be used if no algorithms parameter is provided.
      #   The algorithms hash has the following keys:
      #     :descriptors (descriptors to be used for similarity calculations and local QSAR models),
      #     :similarity (similarity algorithm and thresholds for predictions with high and low confidence),
      #     :feature_selection (feature selection algorithm),
      #     :prediction (local QSAR algorithm).
      #   Default parameters are used for unspecified keys.
      # @return [OpenTox::Model::Lazar]
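      # A minimal usage sketch (the CSV file name and threshold values are hypothetical):
      #   training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
      #   model = Model::Lazar.create training_dataset: training_dataset,
      #     algorithms: {:similarity => {:min => [0.3, 0.1]}}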
      def self.create prediction_feature:nil, training_dataset:, algorithms:{}
        raise ArgumentError, "Please provide a training_dataset and an optional prediction_feature." unless training_dataset
        prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first

        # guess model type
        model = prediction_feature.is_a?(NumericBioActivity) ? LazarRegression.new : LazarClassification.new

        model.prediction_feature_id = prediction_feature.id
        model.training_dataset_id = training_dataset.id
        model.name = training_dataset.name

        # git or gem versioning
        dir = File.dirname(__FILE__)
        path = File.expand_path("../", File.expand_path(dir))
        if Dir.exist?(dir+"/.git")
          commit = `git rev-parse HEAD`.chomp
          branch = `git rev-parse --abbrev-ref HEAD`.chomp
          url = `git config --get remote.origin.url`.chomp
          model.version = {:url => url, :branch => branch, :commit => commit}
        else
          version = File.open(path+"/VERSION", &:gets).chomp
          url = "https://rubygems.org/gems/lazar/versions/"+version
          model.version = {:url => url, :branch => "gem", :commit => version}
        end

        # set defaults
        substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
        raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

        if substance_classes.first == "OpenTox::Compound"

          model.algorithms = {
            :descriptors => {
              :method => "fingerprint",
              :type => "MP2D",
            },
            :feature_selection => nil
          }

          if model.class == LazarClassification
            model.algorithms[:prediction] = {
              :method => "Algorithm::Classification.weighted_majority_vote",
            }
            model.algorithms[:similarity] = {
              :method => "Algorithm::Similarity.tanimoto",
              :min => [0.5, 0.2],
            }
          elsif model.class == LazarRegression
            model.algorithms[:prediction] = {
              :method => "Algorithm::Caret.rf",
            }
            model.algorithms[:similarity] = {
              :method => "Algorithm::Similarity.tanimoto",
              :min => [0.5, 0.2],
            }
          end

        elsif substance_classes.first == "OpenTox::Nanoparticle"
          model.algorithms = {
            :descriptors => {
              :method => "properties",
              :categories => ["P-CHEM"],
            },
            :similarity => {
              :method => "Algorithm::Similarity.weighted_cosine",
              :min => [0.5, 0.2],
            },
            :prediction => {
              :method => "Algorithm::Caret.rf",
            },
            :feature_selection => {
              :method => "Algorithm::FeatureSelection.correlation_filter",
            },
          }
        else
          raise ArgumentError, "Cannot create models for #{substance_classes.first}."
        end

        # overwrite defaults with explicit parameters
        algorithms.each do |type, parameters|
          if parameters and parameters.is_a? Hash
            parameters.each do |p, v|
              model.algorithms[type] ||= {}
              model.algorithms[type][p] = v
              model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
            end
          else
            model.algorithms[type] = parameters
          end
        end if algorithms

        # parse dependent_variables from training dataset
        training_dataset.substances.each do |substance|
          values = training_dataset.values(substance, model.prediction_feature_id)
          values.each do |v|
            model.substance_ids << substance.id.to_s
            model.dependent_variables << v
          end if values
        end

        descriptor_method = model.algorithms[:descriptors][:method]
        model.independent_variables = []
        case descriptor_method
        # parse fingerprints
        when "fingerprint"
          type = model.algorithms[:descriptors][:type]
          model.substances.each_with_index do |s, i|
            model.fingerprints[i] ||= []
            model.fingerprints[i] += s.fingerprint(type)
            model.fingerprints[i].uniq!
          end
          model.descriptor_ids = model.fingerprints.flatten.uniq
          model.descriptor_ids.each do |d|
            # the binary descriptor matrix is only needed for Caret models
            model.independent_variables << model.substance_ids.collect_with_index{|s, i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match(/Caret/)
          end
        # calculate physchem properties
        when "calculate_properties"
          features = model.algorithms[:descriptors][:features]
          model.descriptor_ids = features.collect{|f| f.id.to_s}
          model.algorithms[:descriptors].delete(:features)
          model.algorithms[:descriptors].delete(:type)
          model.substances.each_with_index do |s, i|
            props = s.calculate_properties(features)
            props.each_with_index do |v, j|
              model.independent_variables[j] ||= []
              model.independent_variables[j][i] = v
            end if props and !props.empty?
          end
        # parse independent_variables
        when "properties"
          categories = model.algorithms[:descriptors][:categories]
          feature_ids = []
          categories.each do |category|
            Feature.where(category: category).each{|f| feature_ids << f.id.to_s}
          end
          properties = model.substances.collect{|s| s.properties}
          property_ids = properties.collect{|p| p.keys}.flatten.uniq
          model.descriptor_ids = feature_ids & property_ids
          model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
        else
          raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
        end
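        # At this point model.independent_variables is a descriptor-major matrix:
        # independent_variables[j][i] holds the value of descriptor j for substance i,
        # e.g. [[true, false], [1.2, 0.8]] (hypothetical values) for two descriptors
        # and two training substances.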
        if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
          model = Algorithm.run model.algorithms[:feature_selection][:method], model
        end

        # scale independent_variables
        unless model.fingerprints?
          model.independent_variables.each_with_index do |var, i|
            model.descriptor_means[i] = var.mean
            model.descriptor_sds[i] = var.standard_deviation
            model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
          end
        end
        model.save
        model
      end

      # Predict a substance (compound or nanoparticle)
      # @param [OpenTox::Substance]
      # @return [Hash]
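      # A successful prediction is a Hash roughly like (keys as used in this method, values hypothetical):
      #   {:value => 0.42, :warnings => [], :measurements => [],
      #    :neighbors => [{:id => "...", :measurement => 0.5, :similarity => 0.8}],
      #    :confidence => "Similar to bioassay results"}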
      def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil

        @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
        case algorithms[:similarity][:method]
        when /tanimoto/ # binary features
          similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
          # TODO this excludes descriptors only present in the query substance
          # use for applicability domain?
          query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
        when /euclid|cosine/ # quantitative features
          if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
            features = descriptor_ids.collect{|id| Feature.find(id)}
            query_descriptors = substance.calculate_properties(features)
            similarity_descriptors = query_descriptors.collect_with_index{|v, i| (v-descriptor_means[i])/descriptor_sds[i]}
          else
            similarity_descriptors = []
            query_descriptors = []
            descriptor_ids.each_with_index do |id, i|
              prop = substance.properties[id]
              prop = prop.median if prop.is_a? Array # measured
              if prop
                similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
                query_descriptors[i] = prop
              end
            end
          end
        else
          raise ArgumentError, "Unknown similarity method '#{algorithms[:similarity][:method]}' for descriptor method '#{algorithms[:descriptors][:method]}'."
        end

        prediction ||= {:warnings => [], :measurements => []}
        prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
        neighbor_ids = []
        neighbor_similarities = []
        neighbor_dependent_variables = []
        neighbor_independent_variables = []

        # find neighbors
        substance_ids.each_with_index do |s, i|
          # handle query substance
          if substance.id.to_s == s
            prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
            prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
          else
            if fingerprints?
              neighbor_descriptors = fingerprints[i]
            else
              next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
              neighbor_descriptors = scaled_variables.collect{|v| v[i]}
            end
            sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
            if sim >= threshold
              neighbor_ids << s
              neighbor_similarities << sim
              neighbor_dependent_variables << dependent_variables[i]
              independent_variables.each_with_index do |c, j|
                neighbor_independent_variables[j] ||= []
                neighbor_independent_variables[j] << @independent_variables[j][i]
              end
            end
          end
        end

        if neighbor_similarities.empty?
          prediction[:value] = nil
          prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
          if threshold == algorithms[:similarity][:min].last
            prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
            return prediction
          end
        elsif neighbor_similarities.size == 1
          prediction[:value] = nil
          prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
          prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
          if threshold == algorithms[:similarity][:min].last
            prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
            return prediction
          end
        else
          query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
          # call prediction algorithm
          result = Algorithm.run algorithms[:prediction][:method], dependent_variables: neighbor_dependent_variables, independent_variables: neighbor_independent_variables, weights: neighbor_similarities, query_variables: query_descriptors
          prediction.merge! result
          prediction[:neighbors] = neighbor_ids.collect_with_index{|id, i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
        end

        if threshold == algorithms[:similarity][:min].first
          if prediction[:warnings].empty?
            prediction[:confidence] = "Similar to bioassay results"
            return prediction
          else # try again with a lower threshold
            prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
            predict_substance substance, algorithms[:similarity][:min].last, prediction
          end
        elsif threshold < algorithms[:similarity][:min].first
          prediction[:confidence] = "Lower than bioassay results"
          return prediction
        end
      end

      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset]
      # @return [Hash, Array, OpenTox::Dataset]
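      # Usage sketch (the SMILES string is a hypothetical example):
      #   prediction = model.predict Compound.from_smiles("CC(=O)Nc1ccc(O)cc1")
      #   prediction[:value]     # predicted activity, nil if no prediction was possible
      #   prediction[:neighbors] # similar training substances with measurements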
      def predict object

        training_dataset = Dataset.find training_dataset_id

        # parse data
        substances = []
        if object.is_a? Substance
          substances = [object]
        elsif object.is_a? Array
          substances = object
        elsif object.is_a? Dataset
          substances = object.substances
        else
          raise ArgumentError, "Please provide an OpenTox::Substance, an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
        end

        # make predictions
        predictions = {}
        substances.each do |c|
          predictions[c.id.to_s] = predict_substance c
          if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
            prediction_feature.accept_values.each do |v|
              predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
            end
          end
          predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
        end

        # serialize result
        if object.is_a? Substance
          prediction = predictions[substances.first.id.to_s]
          prediction[:neighbors].sort!{|a, b| b[:similarity] <=> a[:similarity]} if prediction[:neighbors] # sort according to similarity
          return prediction
        elsif object.is_a? Array
          return predictions
        elsif object.is_a? Dataset
          d = object.copy
          #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
          confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
          if prediction_feature.is_a? NominalBioActivity
            f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
            probability_features = {}
            prediction_feature.accept_values.each do |v|
              probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
            end
          elsif prediction_feature.is_a? NumericBioActivity
            f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
            prediction_interval = []
            ["lower", "upper"].each do |v|
              prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
            end
          end

          # add predictions to dataset
          predictions.each do |substance_id, p|
            substance_id = BSON::ObjectId.from_string(substance_id)
            d.add substance_id, confidence_feature, p[:confidence]
            unless p[:value].nil?
              d.add substance_id, f, p[:value]
              p[:probabilities].each{|name, prob| d.add substance_id, probability_features[name], prob} if p[:probabilities]
              p[:prediction_interval].each_with_index{|v, i| d.add substance_id, prediction_interval[i], v} if p[:prediction_interval]
            end
          end
          d.save
          return d
        end

      end

      # Save the model
      #   Stores independent_variables in GridFS to avoid Mongo database size limit problems
      def save
        file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
        self.independent_variables_id = $gridfs.insert_one(file)
        super
      end
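      # Reading the model back mirrors #save, e.g. (sketch):
      #   data = $gridfs.find_one(_id: model.independent_variables_id).data
      #   Marshal.load(data) # => descriptor-major Array, see #independent_variables below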
      # Get independent variables
      # @return [Array]
      def independent_variables
        @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
        @independent_variables
      end

      # Get training dataset
      # @return [OpenTox::Dataset]
      def training_dataset
        Dataset.find(training_dataset_id)
      end

      # Get prediction feature
      # @return [OpenTox::Feature]
      def prediction_feature
        Feature.find(prediction_feature_id)
      end

      # Get training descriptors
      # @return [Array]
      def descriptors
        descriptor_ids.collect{|id| Feature.find(id)}
      end

      # Get training substances
      # @return [Array]
      def substances
        substance_ids.collect{|id| Substance.find(id)}
      end

      # Are fingerprints used as descriptors
      # @return [TrueClass, FalseClass]
      def fingerprints?
        algorithms[:descriptors][:method] == "fingerprint"
      end

    end

    # Classification model
    class LazarClassification < Lazar
    end

    # Regression model
    class LazarRegression < Lazar
    end

    # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
    class Validation

      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps

      field :endpoint, type: String
      field :qmrf, type: Hash
      field :species, type: String
      field :source, type: String
      field :unit, type: String
      field :warnings, type: Array
      field :model_id, type: BSON::ObjectId
      field :repeated_crossvalidation_id, type: BSON::ObjectId

      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset]
      # @return [Hash, Array, OpenTox::Dataset]
      def predict object
        model.predict object
      end

      # Get training dataset
      # @return [OpenTox::Dataset]
      def training_dataset
        model.training_dataset
      end

      # Get lazar model
      # @return [OpenTox::Model::Lazar]
      def model
        Lazar.find model_id
      end

      # Get algorithms
      # @return [Hash]
      def algorithms
        model.algorithms
      end

      # Get prediction feature
      # @return [OpenTox::Feature]
      def prediction_feature
        model.prediction_feature
      end

      # Get repeated crossvalidations
      # @return [OpenTox::Validation::RepeatedCrossValidation]
      def repeated_crossvalidation
        OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
      end

      # Get crossvalidations
      # @return [Array<OpenTox::Validation::CrossValidation>]
      def crossvalidations
        repeated_crossvalidation.crossvalidations
      end

      # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
      # @param [OpenTox::Dataset, nil] training_dataset
      # @param [OpenTox::Feature, nil] prediction_feature
      # @param [Hash, nil] algorithms
      # @return [OpenTox::Model::Validation]
      def self.from_enanomapper training_dataset: nil, prediction_feature: nil, algorithms: nil

        # find/import training_dataset
        training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
        unless training_dataset # try to import
          Import::Enanomapper.import
          training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
          raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
        end
        prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first

        model_validation = self.new(
          :endpoint => prediction_feature.name,
          :source => prediction_feature.source,
          :species => "A549 human lung epithelial carcinoma cells",
          :unit => prediction_feature.unit
        )
        model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
        model_validation[:model_id] = model.id
        repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
        model_validation[:repeated_crossvalidation_id] = repeated_cv.id
        model_validation.save
        model_validation
      end

    end

  end

end
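# A minimal end-to-end sketch (file name and SMILES are hypothetical):
#   training_dataset = OpenTox::Dataset.from_csv_file "EPAFHM_log10.csv"
#   model = OpenTox::Model::Lazar.create training_dataset: training_dataset
#   prediction = model.predict OpenTox::Compound.from_smiles("c1ccccc1N")
#   prediction[:value] # nil if the query is outside the applicability domain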