From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 20:34:39 +0100 Subject: dataset predictions fixed --- lib/dataset.rb | 23 ++++++++++++++++++++++ lib/feature.rb | 12 ++++++++++++ lib/model.rb | 3 ++- lib/validation-statistics.rb | 3 +-- test/classification-model.rb | 46 ++++++++++++-------------------------------- test/compound.rb | 2 +- test/dataset.rb | 2 +- 7 files changed, 52 insertions(+), 39 deletions(-) diff --git a/lib/dataset.rb b/lib/dataset.rb index 41d7b5c..78f5633 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -86,6 +86,10 @@ module OpenTox features.select{|f| f._type.match("SubstanceProperty")} end + def prediction_features + features.select{|f| f._type.match("Prediction")} + end + # Writers # Add a value for a given substance and feature @@ -352,6 +356,25 @@ module OpenTox sdf end + def predictions + predictions = {} + substances.each do |s| + predictions[s] ||= {} + prediction_feature = prediction_features.first + predictions[s][:value] = values(s,prediction_feature).first + predictions[s][:warnings] = [] + warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction + prediction_feature.accept_values.each do |v| + f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) + predictions[s][:probabilities] ||= {} + predictions[s][:probabilities][v] = values(s,f).first + end + end + end + predictions + end + # Dataset operations # Merge an array of datasets diff --git a/lib/feature.rb b/lib/feature.rb index be07e7a..c18b0b8 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -54,18 +54,30 @@ module OpenTox class NominalLazarPrediction < NominalFeature field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId + def name + "#{self[:name]} Prediction" + end end class LazarPredictionProbability < NominalLazarPrediction + def name + "probability(#{self[:name]})" + end end # Numeric lazar prediction class NumericLazarPrediction < NumericFeature field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId + def name + "#{name} Prediction" + end end class LazarPredictionInterval < NumericLazarPrediction + def name + "prediction_interval_#{self[:name]}" + end end class NominalSubstanceProperty < NominalFeature diff --git a/lib/model.rb b/lib/model.rb index fc98e09..7eaa469 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -346,7 +346,8 @@ module OpenTox # add predictions to dataset predictions.each do |substance_id,p| - d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] + substance_id = BSON::ObjectId.from_string(substance_id) + d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty? unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e440731..7bae891 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -150,8 +150,7 @@ module OpenTox y = {:all => [],:without_warnings => []} self.nr_predictions = {:all =>0,:without_warnings => 0} predictions.each do |cid,pred| - p pred - if pred[:value] and pred[:measurements] + !if pred[:value] and pred[:measurements] and !pred[:measurements].empty? self.nr_predictions[:all] +=1 x[:all] << pred[:measurements].median y[:all] << pred[:value] diff --git a/test/classification-model.rb b/test/classification-model.rb index bfb64db..85668fb 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -1,6 +1,6 @@ require_relative "setup.rb" -class LazarClassificationTest < MiniTest::Test +class ClassificationModelTest < MiniTest::Test def test_classification_default algorithms = { @@ -31,31 +31,6 @@ class LazarClassificationTest < MiniTest::Test prediction = model.predict example[:compound] assert_equal example[:prediction], prediction[:value] end - - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") - prediction_dataset = model.predict compound_dataset - puts prediction_dataset.to_csv - assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size - c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O" - prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0] - assert_equal ["true"], prediction_dataset.values(c, prediction_feature) - p_true = LazarPredictionProbability.find_by(:name => "true") - p_false = LazarPredictionProbability.find_by(:name => "false") - p p_true - assert_equal [0.7], prediction_dataset.values(c,p_true) - assert_equal [0.0], prediction_dataset.values(c,p_false) - assert_equal 0.0, p_false - -# cid = prediction_dataset.compounds[7].id.to_s -# assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] -# expectations = ["Cannot create prediction: Only one similar compound in the training set.", -# "Could not find similar substances with experimental data in the training dataset."] -# prediction_dataset.predictions.each do |cid,pred| -# assert_includes expectations, pred[:warnings][0] if pred[:value].nil? -# end -# cid = Compound.from_smiles("CCOC(=O)N").id.to_s -# assert_match "excluded", prediction_dataset.predictions[cid][:info] end def test_classification_parameters @@ -81,16 +56,19 @@ class LazarClassificationTest < MiniTest::Test end def test_dataset_prediction - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") + test_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset - result = model.predict training_dataset - puts result.to_csv + result = model.predict test_dataset assert_kind_of Dataset, result - assert 3, result.features.size - assert 8, result.compounds.size - assert_equal ["true"], result.values(result.compounds.first, result.features[0]) - assert_equal [0.65], result.values(result.compounds.first, result.features[1]) - assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + assert_equal 7, result.features.size + assert_equal 85, result.compounds.size + prediction_feature = result.prediction_features.first + assert_equal ["yes"], result.values(result.compounds[1], prediction_feature) + assert_equal ["no"], result.values(result.compounds[5], prediction_feature) + assert_nil result.predictions[result.compounds.first][:value] + assert_equal "yes", result.predictions[result.compounds[1]][:value] + assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["no"].round(2) end def test_carcinogenicity_rf_classification diff --git a/test/compound.rb b/test/compound.rb index 69ad21e..44e47f1 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -74,7 +74,7 @@ class CompoundTest < MiniTest::Test end def test_openbabel_segfault - inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1" + inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/t11-,12-,14-,18-,19?/m1/s1" c = Compound.from_inchi(inchi) assert_equal inchi, c.inchi diff --git a/test/dataset.rb b/test/dataset.rb index c197648..fd6ed52 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -110,7 +110,7 @@ class DatasetTest < MiniTest::Test assert_match smi, d.warnings.join end duplicates.each do |inchi| - refute_empty d.values(Compound.from_inchi(inchi),d.warnings_feature) + refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first) end d.delete end -- cgit v1.2.3