From 8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Fri, 15 Apr 2016 11:01:16 +0200
Subject: data_entries removed from datasets. datasets are now just containers
 for compounds and features, feature values have to be retrieved from
 substances.

---
 lib/compound.rb                 |  3 +-
 lib/crossvalidation.rb          | 12 ++++----
 lib/dataset.rb                  | 65 +++++++++++++++---------------------
 lib/leave-one-out-validation.rb | 11 ++++---
 lib/model.rb                    | 44 +++++++++++++++-------------
 lib/validation.rb               |  5 ++--
 test/prediction_models.rb       |  1 -
 test/setup.rb                   |  4 +--
 test/validation.rb              | 16 +++++-----
 9 files changed, 70 insertions(+), 91 deletions(-)

diff --git a/lib/compound.rb b/lib/compound.rb
index 55cd482..049d77b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -288,8 +288,7 @@ module OpenTox
         training_dataset.compounds.each do |compound|
           candidate_fingerprint = compound.fingerprint params[:type]
           sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-          feature_values = training_dataset.values(compound,prediction_feature)
-          neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+          neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim]
         end
         neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
       end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index f93a04c..752d393 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -22,8 +22,10 @@ module OpenTox
     end
 
     def self.create model, n=10
-      model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
-      bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+      klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+      klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+      bad_request_error "Unknown model class #{model.class}." unless klass
+
       cv = klass.new(
         name: model.name,
         model_id: model.id,
@@ -35,7 +37,7 @@ module OpenTox
       predictions = {}
       training_dataset = Dataset.find model.training_dataset_id
       training_dataset.folds(n).each_with_index do |fold,fold_nr|
-        #fork do # parallel execution of validations
+        #fork do # parallel execution of validations can lead to Rserve and memory problems
         $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
         t = Time.now
         validation = Validation.create(model, fold[0], fold[1],cv)
@@ -121,7 +123,6 @@ module OpenTox
     end
 
     def misclassifications n=nil
-      #n = predictions.size unless n
       n ||= 10
       model = Model::Lazar.find(self.model_id)
       training_dataset = Dataset.find(model.training_dataset_id)
@@ -132,8 +133,7 @@ module OpenTox
         neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
         neighbors.collect! do |n|
           neighbor = Compound.find(n[0])
-          values = training_dataset.values(neighbor,prediction_feature)
-          { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]}
         end
         {
           :smiles => compound.smiles,
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 274c475..fdf1bfc 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,7 +5,8 @@ module OpenTox
 
   class Dataset
 
-    field :data_entries, type: Hash, default: {}
+    field :substance_ids, type: Array, default: []
+    field :feature_ids, type: Array, default: []
 
     # Readers
 
@@ -19,13 +20,13 @@ module OpenTox
 
     # Get all substances
     def substances
-      @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}
+      @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id}
       @substances
     end
 
     # Get all features
    def features
-      @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact
+      @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
       @features
     end
 
@@ -33,9 +34,9 @@ module OpenTox
     # @param compound [OpenTox::Compound] OpenTox Compound object
     # @param feature [OpenTox::Feature] OpenTox Feature object
     # @return [Array] Data entry values
-    def values(compound, feature)
-      data_entries[compound.id.to_s][feature.id.to_s]
-    end
+    #def values(compound, feature)
+      #data_entries[compound.id.to_s][feature.id.to_s]
+    #end
 
     # Writers
 
@@ -45,9 +46,9 @@ module OpenTox
     end
 
     # Set features
-    #def features=(features)
-      #self.feature_ids = features.collect{|f| f.id}
-    #end
+    def features=(features)
+      self.feature_ids = features.collect{|f| f.id}
+    end
 
     # Dataset operations
 
@@ -55,8 +56,7 @@ module OpenTox
     # @param [Integer] number of folds
     # @return [Array] Array with folds [training_dataset,test_dataset]
     def folds n
-      substance_ids = data_entries.keys
-      len = substance_ids.size
+      len = self.substance_ids.size
       indices = (0..len-1).to_a.shuffle
       mid = (len/n)
       chunks = []
@@ -69,19 +69,11 @@ module OpenTox
         training_idxs = indices-test_idxs
         training_cids = training_idxs.collect{|i| substance_ids[i]}
         chunk = [training_cids,test_cids].collect do |cids|
-          new_data_entries = {}
-          cids.each do |cid|
-            data_entries[cid].each do |f,v|
-              new_data_entries[cid] ||= {}
-              new_data_entries[cid][f] = v
-            end
-          end
-          dataset = self.class.new(:data_entries => new_data_entries, :source => self.id )
+          dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
           dataset.compounds.each do |compound|
             compound.dataset_ids << dataset.id
             compound.save
           end
-          dataset.save
           dataset
         end
         start = last+1
@@ -90,12 +82,6 @@ module OpenTox
       chunks
     end
 
-    # Diagnostics
-
-    def duplicates feature=self.features.first
-      data_entries.select{|sid,f| f[feature.id].size > 1}
-    end
-
     # Serialisation
 
     # converts dataset to csv format including compound smiles as first column, other column headers are feature names
@@ -161,7 +147,6 @@ module OpenTox
       compound_format = feature_names.shift.strip
       # TODO nanoparticles
       bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
-
       numeric = []
       # guess feature types
       feature_names.each_with_index do |f,i|
@@ -180,8 +165,7 @@ module OpenTox
           numeric[i] = false
           feature = NominalFeature.find_or_create_by(metadata)
         end
-        @features ||= []
-        @features << feature if feature
+        feature_ids << feature.id if feature
       end
 
       $logger.debug "Feature values: #{Time.now-time}"
@@ -196,7 +180,7 @@ module OpenTox
       table.each_with_index do |vals,i|
         ct = Time.now
         identifier = vals.shift.strip
-        warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+        warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
         begin
           case compound_format
          when /SMILES/i
@@ -208,41 +192,38 @@ module OpenTox
        rescue
          compound = nil
        end
-        if compound.nil?
-          # compound parsers may return nil
-          warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+        if compound.nil? # compound parsers may return nil
+          warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
           next
         end
+        substance_ids << compound.id
         compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
         compound_time += Time.now-ct
 
         r += 1
-        unless vals.size == @features.size
-          warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+        unless vals.size == feature_ids.size
+          warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
           next
         end
 
        vals.each_with_index do |v,j|
           if v.blank?
-            warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+            warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
            next
          elsif numeric[j]
            v = v.to_f
          else
            v = v.strip
          end
-          self.data_entries[compound.id.to_s] ||= {}
-          self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= []
-          self.data_entries[compound.id.to_s][@features[j].id.to_s] << v
-          compound.toxicities[@features[j].id.to_s] ||= []
-          compound.toxicities[@features[j].id.to_s] << v
+          compound.toxicities[feature_ids[j].to_s] ||= []
+          compound.toxicities[feature_ids[j].to_s] << v
           compound.save
        end
      end
      compounds.duplicates.each do |compound|
        positions = []
        compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
-        warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+        warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
      end
 
      $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 10fbe85..ed917eb 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -10,6 +10,8 @@ module OpenTox
     field :finished_at, type: Time
 
     def self.create model
+      $logger.debug "#{model.name}: LOO validation started"
+      t = Time.now
       model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
       loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
       predictions = model.predict model.training_dataset.compounds
@@ -17,7 +19,7 @@ module OpenTox
       nr_unpredicted = 0
       predictions.each do |cid,prediction|
         if prediction[:value]
-          prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s]
+          prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
         else
           nr_unpredicted += 1
         end
@@ -28,6 +30,7 @@ module OpenTox
       loo.predictions = predictions
       loo.statistics
       loo.save
+      $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
       loo
     end
 
@@ -84,16 +87,12 @@ module OpenTox
 
   class RegressionLeaveOneOutValidation < LeaveOneOutValidation
 
-
-    field :rmse, type: Float, default: 0.0
+    field :rmse, type: Float, default: 0
     field :mae, type: Float, default: 0
-    #field :weighted_rmse, type: Float, default: 0
-    #field :weighted_mae, type: Float, default: 0
     field :r_squared, type: Float
     field :correlation_plot_id, type: BSON::ObjectId
     field :confidence_plot_id, type: BSON::ObjectId
 
-
     def statistics
       stat = ValidationStatistics.regression predictions
       update_attributes(stat)
diff --git a/lib/model.rb b/lib/model.rb
index 1960c10..b82f098 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,6 +20,10 @@ module OpenTox
       def training_dataset
         Dataset.find(training_dataset_id)
       end
+
+      def prediction_feature
+        Feature.find(prediction_feature_id)
+      end
     end
 
     class Lazar < Model
@@ -31,13 +35,10 @@ module OpenTox
       # Create a lazar model from a training_dataset and a feature_dataset
       # @param [OpenTox::Dataset] training_dataset
      # @return [OpenTox::Model::Lazar] Regression or classification model
-      def initialize training_dataset, params={}
+      def initialize prediction_feature, training_dataset, params={}
         super params
 
-        # TODO document convention
-        #p training_dataset.features
-        prediction_feature = training_dataset.features.first
        # set defaults for empty parameters
        self.prediction_feature_id ||= prediction_feature.id
        self.training_dataset_id ||= training_dataset.id
@@ -49,7 +50,6 @@ module OpenTox
       end
 
       def predict_compound compound
-        prediction_feature = Feature.find prediction_feature_id
         neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
         # remove neighbors without prediction_feature
         # check for database activities (neighbors may include query compound)
@@ -122,18 +122,13 @@ module OpenTox
         end
       end
 
-
-      def training_activities
-        i = training_dataset.feature_ids.index prediction_feature_id
-        training_dataset.data_entries.collect{|de| de[i]}
-      end
-
     end
 
     class LazarClassification < Lazar
 
-      def self.create training_dataset, params={}
-        model = self.new training_dataset, params
+      def self.create prediction_feature, training_dataset, params={}
+        model = self.new prediction_feature, training_dataset, params
         model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
         model.neighbor_algorithm ||= "fingerprint_neighbors"
         model.neighbor_algorithm_parameters ||= {}
@@ -151,8 +146,8 @@ module OpenTox
 
     class LazarRegression < Lazar
 
-      def self.create training_dataset, params={}
-        model = self.new training_dataset, params
+      def self.create prediction_feature, training_dataset, params={}
+        model = self.new prediction_feature, training_dataset, params
         model.neighbor_algorithm ||= "fingerprint_neighbors"
         model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
         model.neighbor_algorithm_parameters ||= {}
@@ -173,13 +168,13 @@ module OpenTox
       include Mongoid::Document
       include Mongoid::Timestamps
 
-      # TODO field Validations
       field :endpoint, type: String
       field :species, type: String
       field :source, type: String
       field :unit, type: String
       field :model_id, type: BSON::ObjectId
       field :repeated_crossvalidation_id, type: BSON::ObjectId
+      field :leave_one_out_validation_id, type: BSON::ObjectId
 
       def predict object
         Lazar.find(model_id).predict object
@@ -201,12 +196,16 @@ module OpenTox
         repeated_crossvalidation.crossvalidations
       end
 
+      def leave_one_out_validation
+        LeaveOneOutValidation.find leave_one_out_validation_id
+      end
+
       def regression?
-        training_dataset.features.first.numeric?
+        model.is_a? LazarRegression
       end
 
       def classification?
-        training_dataset.features.first.nominal?
+        model.is_a? LazarClassification
       end
 
       def self.from_csv_file file
@@ -214,14 +213,17 @@ module OpenTox
         bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
         prediction_model = self.new JSON.parse(File.read(metadata_file))
         training_dataset = Dataset.from_csv_file file
+        prediction_feature = training_dataset.features.first
         model = nil
-        if training_dataset.features.first.nominal?
-          model = LazarClassification.create training_dataset
-        elsif training_dataset.features.first.numeric?
-          model = LazarRegression.create training_dataset
+        if prediction_feature.nominal?
+          model = LazarClassification.create prediction_feature, training_dataset
+        elsif prediction_feature.numeric?
+          model = LazarRegression.create prediction_feature, training_dataset
         end
         prediction_model[:model_id] = model.id
+        prediction_model[:prediction_feature_id] = prediction_feature.id
         prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
+        prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id
         prediction_model.save
         prediction_model
       end
diff --git a/lib/validation.rb b/lib/validation.rb
index 484e22e..6b515e4 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -27,14 +27,14 @@ module OpenTox
       atts = model.attributes.dup # do not modify attributes from original model
       atts["_id"] = BSON::ObjectId.new
       atts[:training_dataset_id] = training_set.id
-      validation_model = model.class.create training_set, atts
+      validation_model = model.class.create model.prediction_feature, training_set, atts
       validation_model.save
       predictions = validation_model.predict test_set.compounds
       predictions.each{|cid,p| p.delete(:neighbors)}
       nr_unpredicted = 0
       predictions.each do |cid,prediction|
         if prediction[:value]
-          prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s]
+          prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
         else
           nr_unpredicted += 1
         end
@@ -42,7 +42,6 @@ module OpenTox
       end
       validation = self.new(
         :model_id => validation_model.id,
-        #:prediction_dataset_id => prediction_dataset.id,
         :test_dataset_id => test_set.id,
         :nr_instances => test_set.compounds.size,
         :nr_unpredicted => nr_unpredicted,
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index a2e5fe2..49a2472 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -10,7 +10,6 @@ class PredictionModelTest < MiniTest::Test
     assert pm.classification?
     refute pm.regression?
     pm.crossvalidations.each do |cv|
-      p cv
       assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
     end
     prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..e7c32b4 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
+$mongo.database.drop
+$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index e702278..baee2d1 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -6,14 +6,14 @@ class ValidationTest < MiniTest::Test
 
   def test_default_classification_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     cv = ClassificationCrossValidation.create model
     assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
   end
 
   def test_default_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
-    model = Model::LazarRegression.create dataset
+    model = Model::LazarRegression.create dataset.features.first, dataset
     cv = RegressionCrossValidation.create model
     assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be larger than 1.5, this may occur due to an unfavorable training/test set split"
     assert cv.mae < 1, "MAE #{cv.mae} should be larger than 1, this may occur due to an unfavorable training/test set split"
@@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test
         :type => "FP3"
       }
     }
-    model = Model::LazarClassification.create dataset, params
+    model = Model::LazarClassification.create dataset.features.first, dataset, params
     model.save
     cv = ClassificationCrossValidation.create model
     params = model.neighbor_algorithm_parameters
@@ -54,7 +54,7 @@ class ValidationTest < MiniTest::Test
         :min_sim => 0.7,
       }
     }
-    model = Model::LazarRegression.create dataset, params
+    model = Model::LazarRegression.create dataset.features.first, dataset, params
     cv = RegressionCrossValidation.create model
     cv.validation_ids.each do |vid|
       model = Model::Lazar.find(Validation.find(vid).model_id)
@@ -70,7 +70,7 @@ class ValidationTest < MiniTest::Test
 
   def test_physchem_regression_crossvalidation
     training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
-    model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+    model = Model::LazarRegression.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
     cv = RegressionCrossValidation.create model
     refute_nil cv.rmse
     refute_nil cv.mae
@@ -80,7 +80,7 @@ class ValidationTest < MiniTest::Test
 
   def test_classification_loo_validation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     loo = ClassificationLeaveOneOutValidation.create model
     assert_equal 14, loo.nr_unpredicted
     refute_empty loo.confusion_matrix
@@ -89,7 +89,7 @@ class ValidationTest < MiniTest::Test
 
   def test_regression_loo_validation
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
-    model = Model::LazarRegression.create dataset
+    model = Model::LazarRegression.create dataset.features.first, dataset
     loo = RegressionLeaveOneOutValidation.create model
     assert loo.r_squared > 0.34
   end
@@ -98,7 +98,7 @@ class ValidationTest < MiniTest::Test
 
   def test_repeated_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     repeated_cv = RepeatedCrossValidation.create model
     repeated_cv.crossvalidations.each do |cv|
       assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
--
cgit v1.2.3
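Editorial note (not part of the patch): the Ruby sketch below only illustrates the API change described in the commit message, using calls that appear in this diff. The require path and the CSV file location are assumptions borrowed from the test suite (test/setup.rb uses DATA_DIR = test/data); adjust them to your checkout.

    require_relative "lib/lazar.rb"   # assumed path, run from the repository root
    include OpenTox

    # A dataset is now only a container of substance_ids and feature_ids.
    training_dataset = Dataset.from_csv_file "test/data/hamster_carcinogenicity.csv"
    prediction_feature = training_dataset.features.first

    # Model constructors take the prediction feature as an explicit first argument.
    model = Model::LazarClassification.create prediction_feature, training_dataset

    # Feature values are no longer stored in dataset.data_entries; they are read
    # from the substance itself, keyed by the feature id.
    training_dataset.compounds.each do |compound|
      measurements = compound.toxicities[prediction_feature.id.to_s]
      puts "#{compound.smiles}: #{measurements.inspect}"
    end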