From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Mon, 29 Oct 2018 20:34:39 +0100
Subject: dataset predictions fixed

---
 lib/dataset.rb               | 23 ++++++++++++++++++++++
 lib/feature.rb               | 12 ++++++++++++
 lib/model.rb                 |  3 ++-
 lib/validation-statistics.rb |  3 +--
 test/classification-model.rb | 46 ++++++++++++--------------------------------
 test/compound.rb             |  2 +-
 test/dataset.rb              |  2 +-
 7 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 41d7b5c..78f5633 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -86,6 +86,10 @@ module OpenTox
       features.select{|f| f._type.match("SubstanceProperty")}
     end
 
+    def prediction_features # subset of features whose _type matches "Prediction"
+      features.select { |feature| feature._type.match("Prediction") }
+    end
+
     # Writers
 
     # Add a value for a given substance and feature
@@ -352,6 +356,25 @@ module OpenTox
       sdf
     end
 
+    def predictions # returns {substance => {:value, :warnings, :probabilities}}; :probabilities only for nominal predictions with a value
+      predictions = {}
+      prediction_feature = prediction_features.first # identical for every substance, so look it up once; NOTE(review): relies on the first match being the prediction itself - confirm
+      w_features = warnings_features # likewise loop-invariant
+      substances.each do |s|
+        predictions[s] ||= {}
+        predictions[s][:value] = values(s,prediction_feature).first
+        predictions[s][:warnings] = w_features.flat_map { |w| values(s,w) } # concatenated values of all warnings features
+        if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction
+          prediction_feature.accept_values.each do |v|
+            f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id)
+            predictions[s][:probabilities] ||= {}
+            predictions[s][:probabilities][v] = values(s,f).first
+          end
+        end
+      end
+      predictions
+    end
+
     # Dataset operations
 
     # Merge an array of datasets 
diff --git a/lib/feature.rb b/lib/feature.rb
index be07e7a..c18b0b8 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -54,18 +54,30 @@ module OpenTox
   class NominalLazarPrediction < NominalFeature
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
+    def name # display name: the value of self[:name] followed by " Prediction"
+      "#{self[:name]} Prediction"
+    end
   end
 
   class LazarPredictionProbability < NominalLazarPrediction
+    def name # overrides the inherited name: wraps the value of self[:name] as "probability(...)"
+      "probability(#{self[:name]})"
+    end
   end
 
   # Numeric lazar prediction
   class NumericLazarPrediction < NumericFeature
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
+    def name # display name: the value of self[:name] followed by " Prediction"
+      "#{self[:name]} Prediction" # read the attribute via self[:name]; a bare `name` here calls this method again and never terminates
+    end
   end
 
   class LazarPredictionInterval < NumericLazarPrediction
+    def name # overrides the inherited name: prefixes the value of self[:name] with "prediction_interval_"
+      "prediction_interval_#{self[:name]}"
+    end
   end
 
   class NominalSubstanceProperty < NominalFeature
diff --git a/lib/model.rb b/lib/model.rb
index fc98e09..7eaa469 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -346,7 +346,8 @@ module OpenTox
 
           # add predictions to dataset
           predictions.each do |substance_id,p|
-            d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings]
+            substance_id = BSON::ObjectId.from_string(substance_id)
+            d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty?
             unless p[:value].nil?
               d.add substance_id,f,p[:value]
               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index e440731..7bae891 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -150,8 +150,7 @@ module OpenTox
         y = {:all => [],:without_warnings => []}
         self.nr_predictions = {:all =>0,:without_warnings => 0}
         predictions.each do |cid,pred|
-          p pred
-          if pred[:value] and pred[:measurements] 
+          !if pred[:value] and pred[:measurements] and !pred[:measurements].empty?
             self.nr_predictions[:all] +=1
             x[:all] << pred[:measurements].median
             y[:all] << pred[:value]
diff --git a/test/classification-model.rb b/test/classification-model.rb
index bfb64db..85668fb 100644
--- a/test/classification-model.rb
+++ b/test/classification-model.rb
@@ -1,6 +1,6 @@
 require_relative "setup.rb"
 
-class LazarClassificationTest < MiniTest::Test
+class ClassificationModelTest < MiniTest::Test
 
   def test_classification_default
     algorithms = {
@@ -31,31 +31,6 @@ class LazarClassificationTest < MiniTest::Test
       prediction = model.predict example[:compound]
       assert_equal example[:prediction], prediction[:value]
     end
-
-    # make a dataset prediction
-    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
-    prediction_dataset = model.predict compound_dataset
-    puts prediction_dataset.to_csv
-    assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size
-    c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O"
-    prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0]
-    assert_equal ["true"], prediction_dataset.values(c, prediction_feature)
-    p_true = LazarPredictionProbability.find_by(:name => "true")
-    p_false = LazarPredictionProbability.find_by(:name => "false")
-    p p_true
-    assert_equal [0.7], prediction_dataset.values(c,p_true)
-    assert_equal [0.0], prediction_dataset.values(c,p_false)
-    assert_equal 0.0, p_false
-
-#    cid = prediction_dataset.compounds[7].id.to_s
-#    assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0]
-#    expectations = ["Cannot create prediction: Only one similar compound in the training set.",
-#    "Could not find similar substances with experimental data in the training dataset."]
-#    prediction_dataset.predictions.each do |cid,pred|
-#      assert_includes expectations, pred[:warnings][0] if pred[:value].nil?
-#    end
-#    cid = Compound.from_smiles("CCOC(=O)N").id.to_s
-#    assert_match "excluded", prediction_dataset.predictions[cid][:info]
   end
  
   def test_classification_parameters
@@ -81,16 +56,19 @@ class LazarClassificationTest < MiniTest::Test
   end
 
   def test_dataset_prediction
-    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
+    test_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") # loaded from a different file than the training data
     model = Model::Lazar.create training_dataset: training_dataset
-    result = model.predict training_dataset
-    puts result.to_csv
+    result = model.predict test_dataset # predicting a Dataset is expected to return a Dataset (asserted next)
     assert_kind_of Dataset, result
-    assert 3, result.features.size
-    assert 8, result.compounds.size
-    assert_equal ["true"], result.values(result.compounds.first, result.features[0])
-    assert_equal [0.65], result.values(result.compounds.first, result.features[1])
-    assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if 
+    assert_equal 7, result.features.size
+    assert_equal 85, result.compounds.size
+    prediction_feature = result.prediction_features.first
+    assert_equal ["yes"], result.values(result.compounds[1], prediction_feature)
+    assert_equal ["no"], result.values(result.compounds[5], prediction_feature)
+    assert_nil result.predictions[result.compounds.first][:value] # the first compound is expected to have no predicted value
+    assert_equal "yes", result.predictions[result.compounds[1]][:value]
+    assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["no"].round(2) # probabilities are keyed by class label
   end
 
   def test_carcinogenicity_rf_classification
diff --git a/test/compound.rb b/test/compound.rb
index 69ad21e..44e47f1 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -74,7 +74,7 @@ class CompoundTest < MiniTest::Test
   end
 
   def test_openbabel_segfault
-    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"
+    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/t11-,12-,14-,18-,19?/m1/s1"
 
     c = Compound.from_inchi(inchi)
     assert_equal inchi, c.inchi
diff --git a/test/dataset.rb b/test/dataset.rb
index c197648..fd6ed52 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -110,7 +110,7 @@ class DatasetTest < MiniTest::Test
       assert_match smi, d.warnings.join
     end
     duplicates.each do |inchi|
-      refute_empty d.values(Compound.from_inchi(inchi),d.warnings_feature)
+      refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first)
     end
     d.delete
   end
-- 
cgit v1.2.3