summary refs log tree commit diff
diff options
context:
space:
mode:
author helma@in-silico.ch <helma@in-silico.ch> 2018-10-11 12:13:40 +0200
committer helma@in-silico.ch <helma@in-silico.ch> 2018-10-11 12:13:40 +0200
commit de763211bd2b6451e3a8dc20eb95a3ecf72bef17 (patch)
tree e33da702f80d70dfa424cecea0a6495bab0fcfc2
parent 8b31acab67e22f30a87c995a94f1ee1e2a3d510f (diff)
initial dataset batch prediction
-rw-r--r-- lib/classification.rb 8
-rw-r--r-- lib/feature.rb 4
-rw-r--r-- lib/model.rb 50
-rw-r--r-- test/model-classification.rb 12
-rw-r--r-- test/setup.rb 4
5 files changed, 55 insertions, 23 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index 2668e4a..468c06a 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -20,10 +20,10 @@ module OpenTox
end
# DG: hack to ensure always two probability values
# TODO: does not work for arbitrary feature names FIX!!
- if probabilities.keys.uniq.size == 1
- missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
- probabilities[missing_key] = 0.0
- end
+# if probabilities.keys.uniq.size == 1
+# missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
+# probabilities[missing_key] = 0.0
+# end
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
diff --git a/lib/feature.rb b/lib/feature.rb
index e6fede6..2c10c26 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -37,6 +37,10 @@ module OpenTox
field :training_feature_id, type: BSON::ObjectId
end
+ class LazarPredictionProbability < NominalLazarPrediction # feature class used for the per-class "probability(<accept value>)" entries of a prediction dataset
+ field :value, type: Float # NOTE(review): model.rb passes the accept value itself as :value -- confirm Float is the intended type
+ end
+
# Numeric lazar prediction
class NumericLazarPrediction < NumericFeature
field :model_id, type: BSON::ObjectId
diff --git a/lib/model.rb b/lib/model.rb
index 8901a2c..7ee50fe 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -36,12 +36,12 @@ module OpenTox
#
# @return [OpenTox::Model::Lazar]
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
- bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
- prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature
+ bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
+ prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
# TODO: prediction_feature without training_dataset: use all available data
# guess model type
- prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
+ prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
@@ -199,7 +199,7 @@ module OpenTox
# @return [Hash]
def predict_substance substance, threshold = self.algorithms[:similarity][:min]
- p substance.smiles
+ #p substance.smiles
t = Time.now
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -286,8 +286,8 @@ module OpenTox
else # try again with a lower threshold
predict_substance substance, 0.2
end
- p prediction
- p Time.now - t
+ #p prediction
+ #p Time.now - t
prediction
end
@@ -314,6 +314,11 @@ module OpenTox
predictions = {}
substances.each do |c|
predictions[c.id.to_s] = predict_substance c
+ if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
+ prediction_feature.accept_values.each do |v|
+ predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
+ end
+ end
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
end
@@ -325,17 +330,28 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
- # prepare prediction dataset
- measurement_feature = Feature.find prediction_feature_id
-
- prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.create(
- :name => "Lazar prediction for #{prediction_feature.name}",
- :creator => __FILE__,
- :prediction_feature_id => prediction_feature.id,
- :predictions => predictions
- )
- return prediction_dataset
+ if prediction_feature.is_a? NominalBioActivity
+ f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ probability_features = {}
+ prediction_feature.accept_values.each do |v|
+ probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ end
+ elsif prediction_feature.is_a? NumericBioActivity
+ f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ # TODO prediction interval
+ end
+
+ d = Dataset.new(:name => object.name)
+ # add predictions to dataset
+ predictions.each do |substance_id,p|
+ d.warnings += p[:warnings]
+ unless p[:value].nil?
+ d.add substance_id,f,p[:value]
+ p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p}
+ end
+ end
+ d.save
+ return d
end
end
diff --git a/test/model-classification.rb b/test/model-classification.rb
index 7751bba..ca6eb27 100644
--- a/test/model-classification.rb
+++ b/test/model-classification.rb
@@ -94,6 +94,18 @@ class LazarClassificationTest < MiniTest::Test
training_dataset.delete
end
+ def test_dataset_prediction
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+ model = Model::Lazar.create training_dataset: training_dataset
+ result = model.predict training_dataset
+ assert 3, result.features.size
+ assert 8, result.compounds.size
+ assert_equal ["true"], result.values(result.compounds.first, result.features[0])
+ assert_equal [0.65], result.values(result.compounds.first, result.features[1])
+ assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if
+ #p prediction_dataset
+ end
+
def test_carcinogenicity_rf_classification
skip "Caret rf may run into a (endless?) loop for some compounds."
dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv"
diff --git a/test/setup.rb b/test/setup.rb
index c4c04cb..51871a2 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -3,8 +3,8 @@ require 'minitest/autorun'
require_relative '../lib/lazar.rb'
#require 'lazar'
include OpenTox
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs # recreate GridFS indexes
+$mongo.database.drop
+$gridfs = $mongo.database.fs # recreate GridFS indexes
#PhysChem.descriptors
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")