From de763211bd2b6451e3a8dc20eb95a3ecf72bef17 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 11 Oct 2018 12:13:40 +0200 Subject: initial dataset batch prediction --- lib/classification.rb | 8 +++---- lib/feature.rb | 4 ++++ lib/model.rb | 50 +++++++++++++++++++++++++++++--------------- test/model-classification.rb | 12 +++++++++++ test/setup.rb | 4 ++-- 5 files changed, 55 insertions(+), 23 deletions(-) diff --git a/lib/classification.rb b/lib/classification.rb index 2668e4a..468c06a 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -20,10 +20,10 @@ module OpenTox end # DG: hack to ensure always two probability values # TODO: does not work for arbitrary feature names FIX!! - if probabilities.keys.uniq.size == 1 - missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] - probabilities[missing_key] = 0.0 - end +# if probabilities.keys.uniq.size == 1 +# missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] +# probabilities[missing_key] = 0.0 +# end probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) diff --git a/lib/feature.rb b/lib/feature.rb index e6fede6..2c10c26 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -37,6 +37,10 @@ module OpenTox field :training_feature_id, type: BSON::ObjectId end + class LazarPredictionProbability < NominalLazarPrediction + field :value, type: Float + end + # Numeric lazar prediction class NumericLazarPrediction < NumericFeature field :model_id, type: BSON::ObjectId diff --git a/lib/model.rb b/lib/model.rb index 8901a2c..7ee50fe 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,12 +36,12 @@ module OpenTox # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} - bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset - prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature + bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset + prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data # guess model type - prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new + prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id @@ -199,7 +199,7 @@ module OpenTox # @return [Hash] def predict_substance substance, threshold = self.algorithms[:similarity][:min] - p substance.smiles + #p substance.smiles t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -286,8 +286,8 @@ module OpenTox else # try again with a lower threshold predict_substance substance, 0.2 end - p prediction - p Time.now - t + #p prediction + #p Time.now - t prediction end @@ -314,6 +314,11 @@ module OpenTox predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c + if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] + prediction_feature.accept_values.each do |v| + predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) + end + end predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end @@ -325,17 +330,28 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - # prepare prediction dataset - measurement_feature = Feature.find prediction_feature_id - - prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.create( - :name => "Lazar prediction for #{prediction_feature.name}", - :creator => __FILE__, - :prediction_feature_id => prediction_feature.id, - :predictions => predictions - ) - return prediction_dataset + if prediction_feature.is_a? NominalBioActivity + f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features = {} + prediction_feature.accept_values.each do |v| + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + elsif prediction_feature.is_a? NumericBioActivity + f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) + # TODO prediction interval + end + + d = Dataset.new(:name => object.name) + # add predictions to dataset + predictions.each do |substance_id,p| + d.warnings += p[:warnings] + unless p[:value].nil? + d.add substance_id,f,p[:value] + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} + end + end + d.save + return d end end diff --git a/test/model-classification.rb b/test/model-classification.rb index 7751bba..ca6eb27 100644 --- a/test/model-classification.rb +++ b/test/model-classification.rb @@ -94,6 +94,18 @@ class LazarClassificationTest < MiniTest::Test training_dataset.delete end + def test_dataset_prediction + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + result = model.predict training_dataset + assert 3, result.features.size + assert 8, result.compounds.size + assert_equal ["true"], result.values(result.compounds.first, result.features[0]) + assert_equal [0.65], result.values(result.compounds.first, result.features[1]) + assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + #p prediction_dataset + end + def test_carcinogenicity_rf_classification skip "Caret rf may run into a (endless?) loop for some compounds." dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" diff --git a/test/setup.rb b/test/setup.rb index c4c04cb..51871a2 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -3,8 +3,8 @@ require 'minitest/autorun' require_relative '../lib/lazar.rb' #require 'lazar' include OpenTox -#$mongo.database.drop -#$gridfs = $mongo.database.fs # recreate GridFS indexes +$mongo.database.drop +$gridfs = $mongo.database.fs # recreate GridFS indexes #PhysChem.descriptors TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -- cgit v1.2.3