From 398d8ca681db3aa0a0552eee026705e60dd8449d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Aug 2015 14:02:04 +0200 Subject: crossvalidation fixes --- lib/crossvalidation.rb | 17 ++++- lib/dataset.rb | 2 +- lib/lazar-model.rb | 170 ----------------------------------------------- lib/lazar.rb | 2 +- lib/model.rb | 177 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/regression.rb | 24 +++++++ 6 files changed, 217 insertions(+), 175 deletions(-) delete mode 100644 lib/lazar-model.rb create mode 100644 lib/model.rb (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d926cc4..d0ad324 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -2,11 +2,16 @@ module OpenTox class CrossValidation field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId field :folds, type: Integer field :nr_instances, type: Integer field :nr_unpredicted, type: Integer field :predictions, type: Array field :finished_at, type: Time + + def time + finished_at - created_at + end end class ClassificationCrossValidation < CrossValidation @@ -22,6 +27,7 @@ module OpenTox def self.create model, n=10 cv = self.new + cv.save # set created_at validation_ids = [] nr_instances = 0 nr_unpredicted = 0 @@ -64,6 +70,10 @@ module OpenTox end end cv.update_attributes( + name: model.name, + model_id: model.id, + folds: n, + validation_ids: validation_ids, nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, accept_values: accept_values, @@ -85,10 +95,8 @@ module OpenTox #F measure carcinogen: 0.769, noncarcinogen: 0.348 end - class RegressionCrossValidation < Validation + class RegressionCrossValidation < CrossValidation - field :validation_ids, type: Array, default: [] - field :folds, type: Integer field :rmse, type: Float field :mae, type: Float field :weighted_rmse, type: Float @@ -96,6 +104,7 @@ module OpenTox def self.create model, n=10 cv = self.new + cv.save # set created_at validation_ids = [] nr_instances = 0 nr_unpredicted = 0 @@ -145,6 +154,8 @@ module OpenTox rmse = Math.sqrt(rmse/n) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) cv.update_attributes( + name: model.name, + model_id: model.id, folds: n, validation_ids: validation_ids, nr_instances: nr_instances, diff --git a/lib/dataset.rb b/lib/dataset.rb index 8c5ffc0..5850c3d 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -49,7 +49,7 @@ module OpenTox @data_entries = Marshal.load(data_entry_file.data) bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size - bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries..first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size + bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size $logger.debug "Retrieving data: #{Time.now-t}" end end diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb deleted file mode 100644 index 1970401..0000000 --- a/lib/lazar-model.rb +++ /dev/null @@ -1,170 +0,0 @@ -module OpenTox - - module Model - - class Lazar - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - field :title, type: String - field :creator, type: String, default: __FILE__ - # datasets - field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash - # prediction feature - field :prediction_feature_id, type: BSON::ObjectId - - attr_accessor :prediction_dataset - attr_accessor :training_dataset - - # Create a lazar model from a training_dataset and a feature_dataset - # @param [OpenTox::Dataset] training_dataset - # @return [OpenTox::Model::Lazar] Regression or classification model - def self.create training_dataset - - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 - - # TODO document convention - prediction_feature = training_dataset.features.first - prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new - lazar.training_dataset_id = training_dataset.id - lazar.prediction_feature_id = prediction_feature.id - lazar.title = prediction_feature.title - - lazar.save - lazar - end - - def predict object - - t = Time.now - at = Time.now - - training_dataset = Dataset.find training_dataset_id - prediction_feature = Feature.find prediction_feature_id - - # parse data - compounds = [] - case object.class.to_s - when "OpenTox::Compound" - compounds = [object] - when "Array" - compounds = object - when "OpenTox::Dataset" - compounds = object.compounds - else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." - end - - # make predictions - predictions = [] - neighbors = [] - compounds.each_with_index do |compound,c| - t = Time.new - database_activities = training_dataset.values(compound,prediction_feature) - if database_activities and !database_activities.empty? - database_activities = database_activities.first if database_activities.size == 1 - predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} - next - end - neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) - # add activities - # TODO: improve efficiency, takes 3 times longer than previous version - neighbors.collect! do |n| - rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} - acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact - acts.empty? ? nil : n << acts - end - neighbors.compact! # remove neighbors without training activities - predictions << Algorithm.run(prediction_algorithm, neighbors) - end - - # serialize result - case object.class.to_s - when "OpenTox::Compound" - prediction = predictions.first - prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity - return prediction - when "Array" - return predictions - when "OpenTox::Dataset" - # prepare prediction dataset - prediction_dataset = LazarPrediction.new( - :title => "Lazar prediction for #{prediction_feature.title}", - :creator => __FILE__, - :prediction_feature_id => prediction_feature.id - - ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) - # TODO move into warnings field - warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] - prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} - prediction_dataset.save_all - return prediction_dataset - end - - end - - def training_activities - i = training_dataset.feature_ids.index prediction_feature_id - training_dataset.data_entries.collect{|de| de[i]} - end - - end - - class LazarClassification < Lazar - def initialize - super - self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} - end - end - - class LazarFminerClassification < LazarClassification - - def self.create training_dataset - model = super(training_dataset) - model.update "_type" => self.to_s # adjust class - model = self.find model.id # adjust class - model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" - model.neighbor_algorithm_parameters = { - :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", - :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id, - :min_sim => 0.3 - } - model.save - model - end - end - - class LazarRegression < Lazar - - def initialize - super - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} - end - - end - - class PredictionModel < Lazar - field :category, type: String - field :endpoint, type: String - field :unit, type: String - field :model_id, type: BSON::ObjectId - field :crossvalidation_id, type: BSON::ObjectId - end - - end - -end - diff --git a/lib/lazar.rb b/lib/lazar.rb index 2ea8cba..174fb2c 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -58,7 +58,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor "algorithm.rb", "descriptor.rb", "bbrc.rb", - "lazar-model.rb", + "model.rb", "similarity.rb", "neighbor.rb", "classification.rb", diff --git a/lib/model.rb b/lib/model.rb new file mode 100644 index 0000000..bf8c549 --- /dev/null +++ b/lib/model.rb @@ -0,0 +1,177 @@ +module OpenTox + + module Model + + class Lazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :title, as: :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + field :neighbor_algorithm, type: String + field :neighbor_algorithm_parameters, type: Hash + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + + attr_accessor :prediction_dataset + attr_accessor :training_dataset + + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @return [OpenTox::Model::Lazar] Regression or classification model + def self.create training_dataset + + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + # TODO document convention + prediction_feature = training_dataset.features.first + prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new + lazar.training_dataset_id = training_dataset.id + lazar.prediction_feature_id = prediction_feature.id + lazar.title = prediction_feature.title + + lazar.save + lazar + end + + def predict object + + t = Time.now + at = Time.now + + training_dataset = Dataset.find training_dataset_id + prediction_feature = Feature.find prediction_feature_id + + # parse data + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + # make predictions + predictions = [] + neighbors = [] + compounds.each_with_index do |compound,c| + t = Time.new + database_activities = training_dataset.values(compound,prediction_feature) + if database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} + next + end + neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + # add activities + # TODO: improve efficiency, takes 3 times longer than previous version + neighbors.collect! do |n| + rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} + acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact + acts.empty? ? nil : n << acts + end + neighbors.compact! # remove neighbors without training activities + predictions << Algorithm.run(prediction_algorithm, neighbors) + end + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + prediction = predictions.first + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + # TODO move into warnings field + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end + + end + + def training_activities + i = training_dataset.feature_ids.index prediction_feature_id + training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarClassification < Lazar + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + end + + class LazarFminerClassification < LazarClassification + + def self.create training_dataset + model = super(training_dataset) + model.update "_type" => self.to_s # adjust class + model = self.find model.id # adjust class + model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", + :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id, + :min_sim => 0.3 + } + model.save + model + end + end + + class LazarRegression < Lazar + + def initialize + super + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + + end + + class PredictionModel + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + # TODO field Validations + field :endpoint, type: String + field :species, type: String + field :source, type: String + field :unit, type: String + field :model_id, type: BSON::ObjectId + field :crossvalidation_id, type: BSON::ObjectId + end + + end + +end + diff --git a/lib/regression.rb b/lib/regression.rb index 8a52e7d..0bc6547 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -34,6 +34,30 @@ module OpenTox {:value => prediction,:confidence => confidence} end + def self.weighted_average_with_relevant_fingerprints neighbors + weighted_sum = 0.0 + sim_sum = 0.0 + fingerprint_features = [] + neighbors.each do |row| + n,sim,acts = row + neighbor = Compound.find n + fingerprint_features += neighbor.fp4 + end + fingerprint_features.uniq! + p fingerprint_features +=begin + p n + acts.each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end + end +=end + confidence = sim_sum/neighbors.size.to_f + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + {:value => prediction,:confidence => confidence} + end + # Local support vector regression from neighbors # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required # @return [Numeric] A prediction value. -- cgit v1.2.3