From 1f789133d961c29d3babfaf69cdde3d675288537 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Sat, 24 Aug 2019 14:44:52 +0200
Subject: initial refactored version for mutagenicity paper

---
 lib/model.rb | 669 ++++++++++-------------------------------------------------
 1 file changed, 114 insertions(+), 555 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 07759c5..44e0e50 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -1,570 +1,129 @@
-module OpenTox
-
-  module Model
-
-    class Lazar
-
-      include OpenTox
-      include Mongoid::Document
-      include Mongoid::Timestamps
-      store_in collection: "models"
-
-      attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
-
-      field :name, type: String
-      field :creator, type: String, default: __FILE__
-      field :algorithms, type: Hash, default:{}
-      field :training_dataset_id, type: BSON::ObjectId
-      field :substance_ids, type: Array, default:[]
-      field :prediction_feature_id, type: BSON::ObjectId
-      field :dependent_variables, type: Array, default:[]
-      field :descriptor_ids, type:Array, default:[]
-      field :independent_variables_id, type: BSON::ObjectId
-      field :fingerprints, type: Array, default:[]
-      field :descriptor_weights, type: Array, default:[]
-      field :descriptor_means, type: Array, default:[]
-      field :descriptor_sds, type: Array, default:[]
-      field :scaled_variables, type: Array, default:[]
-      field :version, type: Hash, default:{}
-
-      # Create a lazar model
-      # @param [OpenTox::Dataset] training_dataset
-      # @param [OpenTox::Feature, nil] prediction_feature
-      #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
-      # @param [Hash, nil] algorithms
-      #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
-      #
-      # @return [OpenTox::Model::Lazar]
-      def self.create prediction_feature:nil, training_dataset:, algorithms:{}
-        raise ArgumentError, "Please provide a training_dataset and an optional prediction_feature." unless prediction_feature or training_dataset
-        prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
-
-        # guess model type
-        prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
-
-        model.prediction_feature_id = prediction_feature.id
-        model.training_dataset_id = training_dataset.id
-        model.name = training_dataset.name
-
-        # git or gem versioning
-        dir = File.dirname(__FILE__)
-        path = File.expand_path("../", File.expand_path(dir))
-        if Dir.exists?(dir+"/.git")
-          commit = `git rev-parse HEAD`.chomp
-          branch = `git rev-parse --abbrev-ref HEAD`.chomp
-          url = `git config --get remote.origin.url`.chomp
-          model.version = {:url => url, :branch => branch, :commit => commit}
-        else
-          version = File.open(path+"/VERSION", &:gets).chomp
-          url = "https://rubygems.org/gems/lazar/versions/"+version
-          model.version = {:url => url, :branch => "gem", :commit => version}
-        end
-
-        # set defaults
-        substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
-        raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
-
-        if substance_classes.first == "OpenTox::Compound"
-
-          model.algorithms = {
-            :descriptors => {
-              :method => "fingerprint",
-              :type => "MP2D",
-            },
-            :feature_selection => nil
-          }
-
-          if model.class == LazarClassification
-            model.algorithms[:prediction] = {
-              :method => "Algorithm::Classification.weighted_majority_vote",
-            }
-            model.algorithms[:similarity] = {
-              :method => "Algorithm::Similarity.tanimoto",
-              :min => [0.5,0.2],
-            }
-          elsif model.class == LazarRegression
-            model.algorithms[:prediction] = {
-              :method => "Algorithm::Caret.rf",
-            }
-            model.algorithms[:similarity] = {
-              :method => "Algorithm::Similarity.tanimoto",
-              :min => [0.5,0.2],
-            }
-          end
-
-        elsif substance_classes.first == "OpenTox::Nanoparticle"
-          model.algorithms = {
-            :descriptors => {
-              :method => "properties",
-              :categories => ["P-CHEM"],
-            },
-            :similarity => {
-              :method => "Algorithm::Similarity.weighted_cosine",
-              :min => [0.5,0.2],
-            },
-            :prediction => {
-              :method => "Algorithm::Caret.rf",
-            },
-            :feature_selection => {
-              :method => "Algorithm::FeatureSelection.correlation_filter",
-            },
-          }
-        elsif substance_classes.first == "OpenTox::Substance" and algorithms[:descriptors][:method] == "properties" and algorithms[:descriptors][:categories]
-          model.algorithms = {
-            :feature_selection => nil,
-            :similarity => { # similarity algorithm
-              :method => "Algorithm::Similarity.weighted_cosine",
-              :min => [0.5,0.2]
-            },
-          }
-          if model.class == LazarClassification
-            model.algorithms[:prediction] = {
-              :method => "Algorithm::Classification.weighted_majority_vote",
-            }
-          elsif model.class == LazarRegression
-            model.algorithms[:prediction] = {
-              :method => "Algorithm::Caret.rf",
-            }
-          end
-        else
-          raise ArgumentError, "Cannot create models for #{substance_classes.first} #{algorithms.to_json}."
-        end
-
-        # overwrite defaults with explicit parameters
-        algorithms.each do |type,parameters|
-          if parameters and parameters.is_a? Hash
-            parameters.each do |p,v|
-              model.algorithms[type] ||= {}
-              model.algorithms[type][p] = v
-              model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
-            end
-          else
-            model.algorithms[type] = parameters
-          end
-        end if algorithms
-
-        # parse dependent_variables from training dataset
-        training_dataset.substances.each do |substance|
-          values = training_dataset.values(substance,model.prediction_feature_id)
-          values.each do |v|
-            model.substance_ids << substance.id.to_s
-            model.dependent_variables << v
-          end if values
-        end
-
-        descriptor_method = model.algorithms[:descriptors][:method]
-        model.independent_variables = []
-        case descriptor_method
-        # parse fingerprints
-        when "fingerprint"
-          type = model.algorithms[:descriptors][:type]
-          model.substances.each_with_index do |s,i|
-            model.fingerprints[i] ||= []
-            model.fingerprints[i] += s.fingerprint(type)
-            model.fingerprints[i].uniq!
-          end
-          model.descriptor_ids = model.fingerprints.flatten.uniq
-          model.descriptor_ids.each do |d|
-            model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
-          end
-        # calculate physchem properties
-        when "calculate_properties"
-          features = model.algorithms[:descriptors][:features]
-          model.descriptor_ids = features.collect{|f| f.id.to_s}
-          model.algorithms[:descriptors].delete(:features)
-          model.algorithms[:descriptors].delete(:type)
-          model.substances.each_with_index do |s,i|
-            props = s.calculate_properties(features)
-            props.each_with_index do |v,j|
-              model.independent_variables[j] ||= []
-              model.independent_variables[j][i] = v
-            end if props and !props.empty?
-          end
-        # parse independent_variables
-        when "properties"
-          feature_ids = []
-          model.algorithms[:descriptors][:categories].each do |category|
-            p category
-            Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
-          end
-          p feature_ids
-          property_ids = model.substances.collect { |s| s.properties.keys }.flatten.uniq
-          p property_ids
-          model.descriptor_ids = feature_ids & property_ids
-          p model.descriptor_ids
-          exit
-          model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
-        else
-          raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
-        end
-
-        if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
-          model = Algorithm.run model.algorithms[:feature_selection][:method], model
-        end
-
-        # scale independent_variables
-        unless model.fingerprints?
-          model.independent_variables.each_with_index do |var,i|
-            model.descriptor_means[i] = var.mean
-            model.descriptor_sds[i] = var.standard_deviation
-            model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
-          end
-        end
-        model.save
-        model
-      end
-
-      # Predict a substance (compound or nanoparticle)
-      # @param [OpenTox::Substance]
-      # @return [Hash]
-      def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
-
-        @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
-        case algorithms[:similarity][:method]
-        when /tanimoto/ # binary features
-          similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
-          # TODO this excludes descriptors only present in the query substance
-          # use for applicability domain?
-          query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
-        when /euclid|cosine/ # quantitative features
-          if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
-            features = descriptor_ids.collect{|id| Feature.find(id)}
-            query_descriptors = substance.calculate_properties(features)
-            similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
-          else
-            similarity_descriptors = []
-            query_descriptors = []
-            descriptor_ids.each_with_index do |id,i|
-              prop = substance.properties[id]
-              prop = prop.median if prop.is_a? Array # measured
-              if prop
-                similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
-                query_descriptors[i] = prop
-              end
-            end
-          end
-        else
-          raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
-        end
-
-        prediction ||= {:warnings => [], :measurements => []}
-        prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
-        neighbor_ids = []
-        neighbor_similarities = []
-        neighbor_dependent_variables = []
-        neighbor_independent_variables = []
-
-        # find neighbors
-        substance_ids.each_with_index do |s,i|
-          # handle query substance
-          if substance.id.to_s == s
-            prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
-            prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
-          else
-            if fingerprints?
-              neighbor_descriptors = fingerprints[i]
-            else
-              next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
-              neighbor_descriptors = scaled_variables.collect{|v| v[i]}
-            end
-            sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
-            if sim >= threshold
-              neighbor_ids << s
-              neighbor_similarities << sim
-              neighbor_dependent_variables << dependent_variables[i]
-              independent_variables.each_with_index do |c,j|
-                neighbor_independent_variables[j] ||= []
-                neighbor_independent_variables[j] << @independent_variables[j][i]
-              end
-            end
-          end
-        end
+class Model
+
+  def initialize dir
+    @dir = dir
+    @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp}
+    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
+    if @dependent_variable_type == "binary"
+      abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"]
+      @dependent_variables = @dependent_variables.collect{|v| v.to_i}
+    elsif @dependent_variable_type == "numeric"
+      # TODO check for floats
+      @dependent_variables = @dependent_variables.collect{|v| v.to_f}
+    end
+    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
+    @independent_variables = []
+    @smiles = []
+    File.readlines(File.join(@dir,"independent_variables")).each do |line|
+      items = line.chomp.split(",")
+      @smiles << items.shift
+      items.collect!{|v| v.to_f} if @independent_variable_type == "numeric"
+      @independent_variables << items
+    end
+    @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f}
+  end
-
-        measurements = nil
-
-        if neighbor_similarities.empty?
-          prediction[:value] = nil
-          prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
-          if threshold == algorithms[:similarity][:min].last
-            prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
-            return prediction
-          end
+
+  def crossvalidation folds=10
+    start_time = Time.now
+    nr_instances = @independent_variables.size
+    indices = (0..nr_instances-1).to_a.shuffle
+    mid = (nr_instances/folds)
+    start = 0
+    0.upto(folds-1) do |i|
+      t = Time.now
+      print "Fold #{i}: "
+      # split train data
+      last = start+mid
+      last = last-1 unless nr_instances%folds > i
+      test_idxs = indices[start..last] || []
+      idxs = {
+        :test => test_idxs,
+        :train => indices-test_idxs
+      }
+      start = last+1
+      # write training/test data
+      cv_dir = File.join(@dir,"crossvalidation",i.to_s)
+      dirs = {}
+      idxs.each do |t,idx|
+        d = File.join cv_dir,t.to_s
+        dirs[t] = d
+        FileUtils.mkdir_p d
+        File.open(File.join(d,"independent_variables"),"w+") do |f|
+          idx.each do |i|
+            f.print "#{@smiles[i]},"
+            f.puts @independent_variables[i].join(",")
+          end
+        end
-        elsif neighbor_similarities.size == 1
-          prediction[:value] = nil
-          prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
-          prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
-          if threshold == algorithms[:similarity][:min].last
-            prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
-            return prediction
-          end
-        else
-          query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
-          # call prediction algorithm
-          result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables, independent_variables:neighbor_independent_variables, weights:neighbor_similarities, query_variables:query_descriptors
-          prediction.merge! result
-          prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
-        end
-        if threshold == algorithms[:similarity][:min].first
-          if prediction[:warnings].empty?
-            prediction[:confidence] = "Similar to bioassay results"
-            return prediction
-          else # try again with a lower threshold
-            prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
-            predict_substance substance, algorithms[:similarity][:min].last, prediction
-          end
-        elsif threshold < algorithms[:similarity][:min].first
-          prediction[:confidence] = "Lower than bioassay results"
-          return prediction
-        end
-      end
+        File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
+        if t == :train
+          File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type }
+          File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type }
+          File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") }
+        end
+      end
-
-      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
-      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset]
-      # @return [Hash, Array, OpenTox::Dataset]
-      def predict object
-
-        training_dataset = Dataset.find training_dataset_id
-
-        # parse data
-        substances = []
-        if object.is_a? Substance
-          substances = [object]
-        elsif object.is_a? Array
-          substances = object
-        elsif object.is_a? Dataset
-          substances = object.substances
-        else
-          raise ArgumentError, "Please provide an OpenTox::Compound, an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
-        end
-
-        # make predictions
-        predictions = {}
-        substances.each do |c|
-          predictions[c.id.to_s] = predict_substance c
-          if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
-            prediction_feature.accept_values.each do |v|
-              predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
-            end
-          end
-          predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
-        end
-
-        # serialize result
-        if object.is_a? Substance
-          prediction = predictions[substances.first.id.to_s]
-          prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors] # sort according to similarity
-          return prediction
-        elsif object.is_a? Array
-          return predictions
-        elsif object.is_a? Dataset
-          d = object.copy
-          #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
-          confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
-          if prediction_feature.is_a? NominalBioActivity
-            f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            probability_features = {}
-            prediction_feature.accept_values.each do |v|
-              probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            end
-          elsif prediction_feature.is_a? NumericBioActivity
-            f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            prediction_interval = []
-            ["lower","upper"].each do |v|
-              prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            end
-          end
-
-          # add predictions to dataset
-          predictions.each do |substance_id,p|
-            substance_id = BSON::ObjectId.from_string(substance_id)
-            d.add substance_id,confidence_feature,p[:confidence]
-            unless p[:value].nil?
-              d.add substance_id,f,p[:value]
-              p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
-              p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
-            end
-          end
-          d.save
-          return d
-        end
-
-      end
-
-      # Save the model
-      #   Stores independent_variables in GridFS to avoid Mongo database size limit problems
-      def save
-        file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
-        self.independent_variables_id = $gridfs.insert_one(file)
-        super
-      end
-
-      # Get independent variables
-      # @return [Array]
-      def independent_variables
-        @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
-        @independent_variables
-      end
-
-      # Get training dataset
-      # @return [OpenTox::Dataset]
-      def training_dataset
-        Dataset.find(training_dataset_id)
-      end
-
-      # Get prediction feature
-      # @return [OpenTox::Feature]
-      def prediction_feature
-        Feature.find(prediction_feature_id)
-      end
-
-      # Get training descriptors
-      # @return [Array]
-      def descriptors
-        descriptor_ids.collect{|id| Feature.find(id)}
-      end
-
-      # Get training substances
-      # @return [Array]
-      def substances
-        substance_ids.collect{|id| Substance.find(id)}
-      end
-
-      # Are fingerprints used as descriptors
-      # @return [TrueClass, FalseClass]
-      def fingerprints?
-        algorithms[:descriptors][:method] == "fingerprint" ? true : false
-      end
-
-    end
+      # predict
+      train_model = self.class.new dirs[:train]
+      train_model.predict_file File.join(dirs[:test],"independent_variables")
+      puts Time.now-t
+    end
+    puts "Total: #{Time.now-start_time}"
+  end
+end
-
-    # Classification model
-    class LazarClassification < Lazar
-    end
+
+class ClassificationModel < Model
-
-    # Regression model
-    class LazarRegression < Lazar
-    end
+
+  def predict_file independent_variable_file
+    pred_dir = File.dirname independent_variable_file
+    predictions = []
+    File.readlines(independent_variable_file).each do |line|
+      variables = line.chomp.split(",")
+      smiles = variables.shift
+      variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric"
+      predictions << predict(smiles,variables)
+    end
+    File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} }
+  end
-
-    # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
-    class Validation
-
-      include OpenTox
-      include Mongoid::Document
-      include Mongoid::Timestamps
-
-      field :endpoint, type: String
-      field :qmrf, type: Hash
-      field :species, type: String
-      field :source, type: String
-      field :unit, type: String
-      field :warnings, type: Array
-      field :model_id, type: BSON::ObjectId
-      field :repeated_crossvalidation_id, type: BSON::ObjectId
-
-      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
-      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset]
-      # @return [Hash, Array, OpenTox::Dataset]
-      def predict object
-        model.predict object
-      end
-
-      # Get training dataset
-      # @return [OpenTox::Dataset]
-      def training_dataset
-        model.training_dataset
-      end
-
-      # Get lazar model
-      # @return [OpenTox::Model::Lazar]
-      def model
-        Lazar.find model_id
-      end
-
-      # Get algorithms
-      # @return [Hash]
-      def algorithms
-        model.algorithms
-      end
-
-      # Get prediction feature
-      # @return [OpenTox::Feature]
-      def prediction_feature
-        model.prediction_feature
-      end
-
-      # Get repeated crossvalidations
-      # @return [OpenTox::Validation::RepeatedCrossValidation]
-      def repeated_crossvalidation
-        OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
-      end
-
-      # Get crossvalidations
-      # @return [Array<OpenTox::Validation::CrossValidation>]
-      def crossvalidations
-        repeated_crossvalidation.crossvalidations
-      end
-
-      # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
-      # @return [OpenTox::Model::Validation]
-      def self.from_enanomapper training_dataset: nil, prediction_feature: nil, algorithms: nil
-        training_dataset ||= Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-        unless training_dataset # try to import
-          Import::Enanomapper.import
-          training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-          raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
-        end
-        prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
-
-        model_validation = self.new(
-          :endpoint => prediction_feature.name,
-          :source => prediction_feature.source,
-          :species => "A549 human lung epithelial carcinoma cells",
-          :unit => prediction_feature.unit
-        )
-        model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
-        model_validation[:model_id] = model.id
-        repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
-        model_validation[:repeated_crossvalidation_id] = repeated_cv.id
-        model_validation.save
-        model_validation
-      end
+
+  # TODO: with neighbors
+  def predict_smiles smiles
+  end
+
+  def predict smiles, variables
+    similarities = []
+    @independent_variables.each do |row|
+      if @independent_variable_type == "binary"
+        similarities << Similarity.tanimoto([row, variables])
+      elsif @independent_variable_type == "numeric"
+        similarities << Similarity.cosine([row, variables])
+      end
+    end
+    neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
+    neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
+    neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds
+    return [smiles,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2
+
+    neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
+    neighbor_weights = neighbor_idx.collect{|i| similarities[i]}
+    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_weights)
+    probabilities[1] > probabilities[0] ? classification = 1 : classification = 0
+
+    [ smiles, classification ] + probabilities + [ similarities.max, neighbor_idx.size ]
+  end
+
+  # Weighted majority vote
+  # @param [Array<0,1>] dependent_variables
+  # @param [Array] weights
+  # @return [Array] probabilities
+  def weighted_majority_vote dependent_variables, weights
+    w = []
+    w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]}
+    w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]}
+    weights_sum = weights.sum.to_f
+    weights_max = weights.max.to_f
+    probabilities = []
+    probabilities[0] = weights_max*w[0].sum/weights_sum
+    probabilities[1] = weights_max*w[1].sum/weights_sum
+    probabilities
+  end
+end
-
-    end
-  end
-end
--
cgit v1.2.3
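
A minimal usage sketch of the refactored classes, not part of the commit: the directory name and toy data are hypothetical, the file layout (dependent_variables, dependent_variable_type, independent_variables, independent_variable_type, similarity_thresholds; one value or CSV row per line) follows what Model#initialize reads in the patch above, and the Similarity class is only a stand-in for the tanimoto helper that lib/similarity.rb of the same repository is expected to provide.

    require "fileutils"

    # Stand-in for the Similarity helper assumed by ClassificationModel#predict;
    # the real implementation is expected in lib/similarity.rb of this repository.
    class Similarity
      # Tanimoto/Jaccard index of two fragment sets, passed as a two-element array
      def self.tanimoto fingerprints
        (fingerprints[0] & fingerprints[1]).size / (fingerprints[0] | fingerprints[1]).size.to_f
      end
    end

    require_relative "lib/model" # assumes the patched file is saved as lib/model.rb

    dir = "mutagenicity-model" # hypothetical example directory
    FileUtils.mkdir_p dir
    File.write File.join(dir,"dependent_variable_type"), "binary\n"
    File.write File.join(dir,"independent_variable_type"), "binary\n" # fragment fingerprints
    # one 0/1 label per line, same order as the rows of independent_variables
    File.write File.join(dir,"dependent_variables"), "1\n0\n1\n0\n"
    # one line per compound: SMILES first, then its fingerprint fragment identifiers
    File.write File.join(dir,"independent_variables"),
      "c1ccc2cc3ccccc3cc2c1,F1,F2,F3\nCCO,F4,F5\nc1ccc2ccccc2c1,F1,F2,F4\nCCCCO,F4,F5,F6\n"
    # lower and upper similarity thresholds, one per line
    File.write File.join(dir,"similarity_thresholds"), "0.2\n0.5\n"

    model = ClassificationModel.new dir
    # => [smiles, classification, p(0), p(1), max similarity, number of neighbors]
    p model.predict("c1ccc2ccccc2c1", ["F1","F2","F4"])
    # model.crossvalidation 2 # would run a 2-fold CV; every training fold must contain both classes

Writing dependent_variable_type, independent_variable_type and similarity_thresholds only into the train directories is what lets crossvalidation re-instantiate a model from dirs[:train] via self.class.new, while a test directory only needs the raw independent_variables file that predict_file reads.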
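
Two arithmetic details of the new code, worked through with assumed example numbers. The fold splitting in crossvalidation gives the first nr_instances % folds folds one extra instance: with 25 instances and 10 folds, mid = 2, so folds 0-4 receive 3 instances each and folds 5-9 receive 2, covering all 25 exactly once.

In weighted_majority_vote the probabilities are similarity-weighted class frequencies scaled by the maximum similarity: for neighbor labels [1, 0, 1] with similarities [0.9, 0.6, 0.5], weights_sum = 2.0 and weights_max = 0.9, giving probabilities [0.9*0.6/2.0, 0.9*1.4/2.0] = [0.27, 0.63] and hence class 1. Note that the two probabilities sum to weights_max, not to 1.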