summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-04-13 15:15:51 +0200
committerChristoph Helma <helma@in-silico.ch>2016-04-13 15:15:51 +0200
commita8368dda776c05331474adf7eaf9a6e413a3b1eb (patch)
treedaafac9a7453a8d453fc6992293afe6f6a937551
parent84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 (diff)
validation tests pass
-rw-r--r--lib/compound.rb2
-rw-r--r--lib/crossvalidation.rb109
-rw-r--r--lib/dataset.rb40
-rw-r--r--lib/lazar.rb3
-rw-r--r--lib/leave-one-out-validation.rb108
-rw-r--r--lib/model.rb23
-rw-r--r--lib/validation.rb62
-rw-r--r--test/classification.rb6
-rw-r--r--test/validation.rb6
9 files changed, 78 insertions, 281 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 84d8891..757ba1a 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -341,7 +341,7 @@ module OpenTox
{'$sort' => {'tanimoto' => -1}}
]
- $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+ $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index b7cd7bf..f93a04c 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -6,7 +6,7 @@ module OpenTox
field :folds, type: Integer
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array, default: []
+ field :predictions, type: Hash, default: {}
field :finished_at, type: Time
def time
@@ -32,7 +32,7 @@ module OpenTox
cv.save # set created_at
nr_instances = 0
nr_unpredicted = 0
- predictions = []
+ predictions = {}
training_dataset = Dataset.find model.training_dataset_id
training_dataset.folds(n).each_with_index do |fold,fold_nr|
#fork do # parallel execution of validations
@@ -42,12 +42,12 @@ module OpenTox
$logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
#end
end
- #Process.waitall
+ Process.waitall
cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
cv.validations.each do |validation|
nr_instances += validation.nr_instances
nr_unpredicted += validation.nr_unpredicted
- predictions += validation.predictions
+ predictions.merge! validation.predictions
end
cv.update_attributes(
nr_instances: nr_instances,
@@ -73,61 +73,8 @@ module OpenTox
# TODO auc, f-measure (usability??)
def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
- predictions.each do |pred|
- compound_id,activities,prediction,confidence = pred
- if activities and prediction #and confidence.numeric?
- if activities.uniq.size == 1
- activity = activities.uniq.first
- if prediction == activity
- if prediction == accept_values[0]
- confusion_matrix[0][0] += 1
- #weighted_confusion_matrix[0][0] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][1] += 1
- #weighted_confusion_matrix[1][1] += confidence
- end
- elsif prediction != activity
- if prediction == accept_values[0]
- confusion_matrix[0][1] += 1
- #weighted_confusion_matrix[0][1] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][0] += 1
- #weighted_confusion_matrix[1][0] += confidence
- end
- end
- end
- else
- nr_unpredicted += 1 if prediction.nil?
- end
- end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- #weighted_confusion_matrix.each do |r|
- #r.each do |c|
- #confidence_sum += c
- #end
- #end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- #weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
+ stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+ update_attributes(stat)
end
def confidence_plot
@@ -169,48 +116,8 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
def statistics
- rmse = 0
- mae = 0
- x = []
- y = []
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- unless activity == [nil]
- x << -Math.log10(activity.median)
- y << -Math.log10(prediction)
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- #weighted_rmse += confidence*error**2
- mae += error.abs
- #weighted_mae += confidence*error.abs
- #confidence_sum += confidence
- end
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- #weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- update_attributes(
- mae: mae,
- rmse: rmse,
- #weighted_mae: weighted_mae,
- #weighted_rmse: weighted_rmse,
- r_squared: r**2,
- finished_at: Time.now
- )
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ stat = ValidationStatistics.regression predictions
+ update_attributes(stat)
end
def misclassifications n=nil
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 5c04382..25307c9 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,9 +5,6 @@ module OpenTox
class Dataset
- # associations like has_many, belongs_to deteriorate performance
- #field :feature_ids, type: Array, default: []
- #field :substance_ids, type: Array, default: []
field :data_entries, type: Hash, default: {}
# Readers
@@ -24,7 +21,7 @@ module OpenTox
# Get all features
def features
- @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)}
+ @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)}
@features
end
@@ -33,7 +30,7 @@ module OpenTox
# @param feature [OpenTox::Feature] OpenTox Feature object
# @return [Array] Data entry values
def values(compound, feature)
- data_entries[compound.id,feature.id]
+ data_entries[compound.id.to_s][feature.id.to_s]
end
# Writers
@@ -68,15 +65,14 @@ module OpenTox
training_idxs = indices-test_idxs
training_cids = training_idxs.collect{|i| substance_ids[i]}
chunk = [training_cids,test_cids].collect do |cids|
- new_cids = []
- new_data_entries = []
+ new_data_entries = {}
cids.each do |cid|
- data_entries[cid].each do |de|
- new_cids << cid
- new_data_entries << de
+ data_entries[cid].each do |f,v|
+ new_data_entries[cid] ||= {}
+ new_data_entries[cid][f] = v
end
end
- dataset = self.class.new(:data_entries => data_entries, :source => self.id )
+ dataset = self.class.new(:data_entries => new_data_entries, :source => self.id )
dataset.compounds.each do |compound|
compound.dataset_ids << dataset.id
compound.save
@@ -213,9 +209,6 @@ module OpenTox
next
end
- #substance_ids << compound.id
- #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1)
-
vals.each_with_index do |v,j|
if v.blank?
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
@@ -228,10 +221,8 @@ module OpenTox
self.data_entries[compound.id.to_s] ||= {}
self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= []
self.data_entries[compound.id.to_s][@features[j].id.to_s] << v
- #i = compound.feature_ids.index feature_ids[j]
- #TODO
- #compound.features[feature_ids[j].to_s] ||= []
- #compound.features[feature_ids[j].to_s] << v
+ compound.features[@features[j].id.to_s] ||= []
+ compound.features[@features[j].id.to_s] << v
compound.save
end
end
@@ -251,14 +242,23 @@ module OpenTox
end
# Dataset for lazar predictions
- class LazarPrediction < Dataset
+ class LazarPrediction #< Dataset
field :creator, type: String
- field :prediction_feature_id, type: String
+ field :prediction_feature_id, type: BSON::ObjectId
+ field :predictions, type: Hash, default: {}
def prediction_feature
Feature.find prediction_feature_id
end
+ def compounds
+ substances.select{|s| s.is_a? Compound}
+ end
+
+ def substances
+ predictions.keys.collect{|id| Substance.find id}
+ end
+
end
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2bcecc5..a1ad551 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -62,7 +62,7 @@ suppressPackageStartupMessages({
# OpenTox classes and includes
#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
-CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -81,6 +81,7 @@ CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","Leave
"validation.rb",
"crossvalidation.rb",
"leave-one-out-validation.rb",
+ "validation-statistics.rb",
"experiment.rb",
"import.rb",
].each{ |f| require_relative f }
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 2cd13db..10fbe85 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -6,20 +6,26 @@ module OpenTox
field :dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Hash
field :finished_at, type: Time
def self.create model
model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
- compound_ids = model.training_dataset.compound_ids
predictions = model.predict model.training_dataset.compounds
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
+ predictions.each{|cid,p| p.delete(:neighbors)}
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s]
+ else
+ nr_unpredicted += 1
+ end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measured]
+ end
loo.nr_instances = predictions.size
- predictions.select!{|p| p[:value]} # remove unpredicted
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
+ loo.nr_unpredicted = nr_unpredicted
+ loo.predictions = predictions
loo.statistics
loo.save
loo
@@ -42,53 +48,8 @@ module OpenTox
field :confidence_plot_id, type: BSON::ObjectId
def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- predictions.each do |pred|
- pred[:database_activities].each do |db_act|
- if pred[:value]
- if pred[:value] == db_act
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:confidence]
- end
- else
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:confidence]
- end
- end
- end
- end
- end
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
- end
- end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
+ stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+ update_attributes(stat)
end
def confidence_plot
@@ -132,43 +93,10 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
+
def statistics
- confidence_sum = 0
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- error = Math.log10(pred[:value])-Math.log10(activity)
- self.rmse += error**2
- #self.weighted_rmse += pred[:confidence]*error**2
- self.mae += error.abs
- #self.weighted_mae += pred[:confidence]*error.abs
- #confidence_sum += pred[:confidence]
- end
- end
- if pred[:database_activities].empty?
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- self.mae = self.mae/predictions.size
- #self.weighted_mae = self.weighted_mae/confidence_sum
- self.rmse = Math.sqrt(self.rmse/predictions.size)
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
- self.r_squared = r**2
- self.finished_at = Time.now
- save
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ stat = ValidationStatistics.regression predictions
+ update_attributes(stat)
end
def correlation_plot
diff --git a/lib/model.rb b/lib/model.rb
index 1f9942b..5140d5a 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -90,33 +90,36 @@ module OpenTox
end
# make predictions
- predictions = []
- predictions = compounds.collect{|c| predict_compound c}
+ predictions = {}
+ compounds.each do |c|
+ predictions[c.id.to_s] = predict_compound c
+ predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
+ end
# serialize result
case object.class.to_s
when "OpenTox::Compound"
- prediction = predictions.first
+ prediction = predictions[compounds.first.id.to_s]
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
- return prediction
+ return predictions
when "Array"
return predictions
when "OpenTox::Dataset"
+ predictions.each{|cid,p| p.delete(:neighbors)}
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
- prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
+ prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
prediction_dataset = LazarPrediction.new(
:name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
:prediction_feature_id => prediction_feature.id
)
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
- warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
- prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
- prediction_dataset.compounds = compounds
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
+
+ compounds.each_with_index do |c,i|
+ prediction_dataset.predictions[c.id.to_s] = predictions[i]
+ end
prediction_dataset.save
return prediction_dataset
end
diff --git a/lib/validation.rb b/lib/validation.rb
index b72d273..484e22e 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -8,7 +8,7 @@ module OpenTox
field :test_dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Hash
def prediction_dataset
Dataset.find prediction_dataset_id
@@ -29,30 +29,22 @@ module OpenTox
atts[:training_dataset_id] = training_set.id
validation_model = model.class.create training_set, atts
validation_model.save
- cids = test_set.compound_ids
-
- test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
- prediction_dataset = validation_model.predict test_set_without_activities
- predictions = []
+ predictions = validation_model.predict test_set.compounds
+ predictions.each{|cid,p| p.delete(:neighbors)}
nr_unpredicted = 0
- activities = test_set.data_entries.collect{|de| de.first}
- prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] #and de[1]
- cid = prediction_dataset.compound_ids[i]
- rows = cids.each_index.select{|r| cids[r] == cid }
- activities = rows.collect{|r| test_set.data_entries[r][0]}
- prediction = de.first
- confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s]
else
nr_unpredicted += 1
end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measured]
end
validation = self.new(
:model_id => validation_model.id,
- :prediction_dataset_id => prediction_dataset.id,
+ #:prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
- :nr_instances => test_set.compound_ids.size,
+ :nr_instances => test_set.compounds.size,
:nr_unpredicted => nr_unpredicted,
:predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
)
@@ -67,42 +59,6 @@ module OpenTox
end
class RegressionValidation < Validation
-
- def statistics
- rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
- mae = 0
- weighted_mae = 0
- confidence_sum = 0
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- x = predictions.collect{|p| p[1].median}
- y = predictions.collect{|p| p[2]}
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
- end
end
end
diff --git a/test/classification.rb b/test/classification.rb
index bedbe14..af23db6 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -33,8 +33,10 @@ class LazarClassificationTest < MiniTest::Test
prediction = model.predict compound_dataset
assert_equal compound_dataset.compounds, prediction.compounds
- assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
- assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
+ cid = prediction.compounds[7].id.to_s
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.predictions[cid][:warning]
+ cid = prediction.compounds[9].id.to_s
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.predictions[cid][:warning]
# cleanup
[training_dataset,model,compound_dataset].each{|o| o.delete}
end
diff --git a/test/validation.rb b/test/validation.rb
index d8eea59..e702278 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -8,15 +8,15 @@ class ValidationTest < MiniTest::Test
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
model = Model::LazarClassification.create dataset
cv = ClassificationCrossValidation.create model
- assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
+ assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
end
def test_default_regression_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
model = Model::LazarRegression.create dataset
cv = RegressionCrossValidation.create model
- assert cv.rmse < 1.5, "RMSE > 1.5"
- assert cv.mae < 1
+ assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split"
+ assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split"
end
# parameters