From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 17:58:09 +0100 Subject: dataset folds fixed --- lib/crossvalidation.rb | 16 +++++++--------- lib/dataset.rb | 30 +++--------------------------- lib/feature.rb | 2 +- lib/model.rb | 15 +++++++++------ test/classification-model.rb | 1 + test/dataset.rb | 4 ++++ test/regression-model.rb | 18 ++++++++++++++++++ 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 2e44ff2..4f61ff4 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -29,16 +29,14 @@ module OpenTox training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = TrainTest.create(model, fold[0], fold[1]) - cv.validation_ids << validation.id - cv.nr_instances += validation.nr_instances - cv.nr_unpredicted += validation.nr_unpredicted - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + cv.nr_instances += validation.nr_instances + cv.nr_unpredicted += validation.nr_unpredicted + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end - #Process.waitall cv.save $logger.debug "Nr unpredicted: #{cv.nr_unpredicted}" cv.statistics diff --git a/lib/dataset.rb b/lib/dataset.rb index 9611fff..41d7b5c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -401,7 +401,7 @@ module OpenTox substance.dataset_ids << dataset.id substance.dataset_ids.uniq! substance.save - dataset.data_entries << data_entries.select{|row| row[0] == substance.id} + dataset.data_entries += data_entries.select{|row| row[0] == substance.id} end dataset.save dataset @@ -433,6 +433,8 @@ module OpenTox if map values(c,feature).each { |v| dataset.add c, new_feature, map[v] } else + end + end end def transform # TODO @@ -446,30 +448,4 @@ module OpenTox end - # Dataset for lazar predictions - class LazarPrediction < Dataset - field :creator, type: String - #field :prediction_feature_id, type: BSON::ObjectId - field :predictions, type: Hash, default: {} - - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - Feature.find prediction_feature_id - end - - def prediction compound - end - - def probability klass - end - - def prediction_interval - end - - def predictions - end - - end - end diff --git a/lib/feature.rb b/lib/feature.rb index 50dea77..be07e7a 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -65,7 +65,7 @@ module OpenTox field :training_feature_id, type: BSON::ObjectId end - class LazarConfidenceInterval < NumericLazarPrediction + class LazarPredictionInterval < NumericLazarPrediction end class NominalSubstanceProperty < NominalFeature diff --git a/lib/model.rb b/lib/model.rb index 9858949..fc98e09 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -277,7 +277,7 @@ module OpenTox prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." + #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." #end end if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 @@ -328,7 +328,8 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - warning_feature = InfoFeature.find_or_create_by(:name => "Warnings") + d = object.copy + warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} @@ -337,17 +338,19 @@ module OpenTox end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - # TODO prediction interval + prediction_interval = {} + ["lower","upper"].each do |v| + prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end end - d = Dataset.new(:name => object.name) # add predictions to dataset predictions.each do |substance_id,p| d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] unless p[:value].nil? d.add substance_id,f,p[:value] - p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} - # TODO prediction interval + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] + p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] end end d.save diff --git a/test/classification-model.rb b/test/classification-model.rb index 7a2a64f..bfb64db 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -84,6 +84,7 @@ class LazarClassificationTest < MiniTest::Test training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset result = model.predict training_dataset + puts result.to_csv assert_kind_of Dataset, result assert 3, result.features.size assert 8, result.compounds.size diff --git a/test/dataset.rb b/test/dataset.rb index 0beea2d..c197648 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -172,6 +172,10 @@ class DatasetTest < MiniTest::Test fold.each do |d| assert_operator d.compounds.size, :>=, d.compounds.uniq.size end + refute_empty fold[0].compounds + refute_empty fold[1].compounds + refute_empty fold[0].data_entries + refute_empty fold[1].data_entries assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size assert_empty (fold.first.substances & fold.last.substances) diff --git a/test/regression-model.rb b/test/regression-model.rb index 5903e88..0104741 100644 --- a/test/regression-model.rb +++ b/test/regression-model.rb @@ -168,4 +168,22 @@ class LazarRegressionTest < MiniTest::Test assert_equal 0.83, prediction[:value].round(2) end + def test_dataset_prediction + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: training_dataset + result = model.predict training_dataset + assert_kind_of Dataset, result + puts result.to_csv + puts result.features + # TODO + # check prediction + # check prediction_interval + # check warnings/applicability domain + assert 3, result.features.size + assert 8, result.compounds.size + assert_equal ["true"], result.values(result.compounds.first, result.features[1]) + assert_equal [0.65], result.values(result.compounds.first, result.features[2]) + assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + end + end -- cgit v1.2.3