summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-02-13 13:15:29 +0100
committer Christoph Helma <helma@in-silico.ch> 2016-02-13 13:15:29 +0100
commit e778475c578f13f30af4437845716d7e781c2609 (patch)
tree 82c14dabc4cf29df1f097a9f8c5c4d8b0b406c4d
parent f61b7d3c65d084747dc1bf87214e5ec0c57326be (diff)
improved handling of duplicates in validations
-rw-r--r-- lib/crossvalidation.rb 3
-rw-r--r-- lib/dataset.rb 1
-rw-r--r-- lib/model.rb 30
-rw-r--r-- lib/regression.rb 62
-rw-r--r-- lib/validation.rb 62
-rw-r--r-- test/validation.rb 16
6 files changed, 111 insertions, 63 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 9789882..0c5f0be 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -176,7 +176,6 @@ module OpenTox
mae = 0
weighted_mae = 0
confidence_sum = 0
- p predictions
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
@@ -195,8 +194,6 @@ module OpenTox
y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
- p x
- p y
R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
r = R.eval("r").to_ruby
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 55cde63..7925bcd 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -93,6 +93,7 @@ module OpenTox
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
+ # TODO fix splits for duplicates
len = self.compound_ids.size
indices = (0..len-1).to_a.shuffle
mid = (len/n)
diff --git a/lib/model.rb b/lib/model.rb
index 44b36e6..0d2354f 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -48,7 +48,7 @@ module OpenTox
self
end
- def predict object, use_database_values=true
+ def predict object
t = Time.now
at = Time.now
@@ -79,31 +79,21 @@ module OpenTox
# remove neighbors without prediction_feature
# check for database activities (neighbors may include query compound)
database_activities = nil
+ prediction = {}
if neighbors.collect{|n| n["_id"]}.include? compound.id
database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s]
+ prediction[:database_activities] = database_activities
+ prediction[:warning] = "#{database_activities.size} structures have been removed from neighbors, because they have the same structure as the query compound."
neighbors.delete_if{|n| n["_id"] == compound.id}
end
neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
if neighbors.empty?
- prediction = {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."}
+ prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
else
- prediction = Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id})
+ prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id}))
end
- prediction[:database_activities] = database_activities
predictions << prediction
-
-=begin
-# TODO scaled dataset for physchem
- p neighbor_algorithm_parameters
- p (neighbor_algorithm_parameters["feature_dataset_id"])
- d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"])
- p d
- p d.class
- if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset
- p "SCALED"
- end
-=end
end
# serialize result
@@ -116,6 +106,8 @@ module OpenTox
return predictions
when "OpenTox::Dataset"
# prepare prediction dataset
+ measurement_feature = prediction_feature
+ prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
prediction_dataset = LazarPrediction.new(
:name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
@@ -125,9 +117,11 @@ module OpenTox
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" )
# TODO move into warnings field
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
- prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
prediction_dataset.compounds = compounds
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]}
+ #prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]}
+ # TODO fix dataset measurements
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence] , p[:dataset_activities].to_s, p[:warning]]}
prediction_dataset.save_all
return prediction_dataset
end
diff --git a/lib/regression.rb b/lib/regression.rb
index 7c64d8f..2b41851 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -4,23 +4,19 @@ module OpenTox
class Regression
def self.weighted_average compound, params
- #p params.keys
weighted_sum = 0.0
sim_sum = 0.0
confidence = 0.0
neighbors = params[:neighbors]
- #activities = []
neighbors.each do |row|
- #if row["dataset_ids"].include? params[:training_dataset_id]
- sim = row["tanimoto"]
- confidence = sim if sim > confidence # distance to nearest neighbor
- # TODO add LOO errors
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- weighted_sum += sim*Math.log10(act)
- #activities << act # TODO: Transformation??
- sim_sum += sim
- end
- #end
+ sim = row["tanimoto"]
+ confidence = sim if sim > confidence # distance to nearest neighbor
+ # TODO add LOO errors
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ weighted_sum += sim*Math.log10(act)
+ #activities << act # TODO: Transformation??
+ sim_sum += sim
+ end
end
#R.assign "activities", activities
#R.eval "cv = cv(activities)"
@@ -35,7 +31,7 @@ module OpenTox
def self.local_pls_regression compound, params
neighbors = params[:neighbors]
- return {:value => nil, :confidence => nil} unless neighbors.size > 0
+ return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
activities = []
fingerprints = {}
weights = []
@@ -62,21 +58,37 @@ module OpenTox
fingerprints.each do |k,v|
unless v.uniq.size == 1
data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
- variables << "'#{k}'"
+ variables << k
end
end
- begin
+ if variables.empty?
+ result = weighted_average(compound, params)
+ result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
+ return result
+ return {:value => nil, :confidence => nil} # TODO confidence
+ else
R.eval "data <- data.frame(#{data_frame.join ","})"
- R.eval "names(data) <- c('activities',#{variables.join ','})"
- R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)"
- compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f }
- R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
- R.eval "names(fingerprint) <- c(#{variables.join ','})"
- R.eval "prediction <- predict(model,fingerprint)"
- prediction = 10**R.eval("prediction").to_f
- {:value => prediction, :confidence => 1} # TODO confidence
- rescue
- {:value => nil, :confidence => nil} # TODO confidence
+ R.assign "features", variables
+ R.eval "names(data) <- append(c('activities'),features)" #
+ begin
+ R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)"
+ rescue # fall back to weighted average
+ result = weighted_average(compound, params)
+ result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
+ return result
+ end
+ #begin
+ #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX
+ compound_features = variables.collect{|f| compound.fingerprint.include? f }
+ R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
+ R.eval "names(fingerprint) <- features" #
+ R.eval "prediction <- predict(model,fingerprint)"
+ prediction = 10**R.eval("prediction").to_f
+ return {:value => prediction, :confidence => 1} # TODO confidence
+ #rescue
+ #p "Prediction failed"
+ #return {:value => nil, :confidence => nil} # TODO confidence
+ #end
end
end
diff --git a/lib/validation.rb b/lib/validation.rb
index c52ffc0..651860e 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -29,17 +29,22 @@ module OpenTox
atts[:training_dataset_id] = training_set.id
validation_model = model.class.create training_set, atts
validation_model.save
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+ cids = test_set.compound_ids
+
+ test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
prediction_dataset = validation_model.predict test_set_without_activities
predictions = []
nr_unpredicted = 0
activities = test_set.data_entries.collect{|de| de.first}
prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] and de[1] and de[1].numeric?
- activity = activities[i]
+ if de[0] and de[1]
+ cid = prediction_dataset.compound_ids[i]
+ rows = cids.each_index.select{|r| cids[r] == cid }
+ activities = rows.collect{|r| test_set.data_entries[r][0]}
+ #activity = activities[i]
prediction = de.first
confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]]
+ predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
else
nr_unpredicted += 1
end
@@ -57,6 +62,55 @@ module OpenTox
validation
end
+ # Compute regression statistics (R^2, RMSE, MAE) for this validation.
+ #
+ # Every entry of +predictions+ is read as
+ # [compound_id, activities, prediction, confidence]. +activities+ is expected
+ # to hold all measurements of one compound (duplicates are collected into an
+ # array when the validation is created) and is summarised by its median.
+ # Errors are computed on a log10 scale.
+ #
+ # Entries without a usable measurement or without a prediction are reported
+ # in +warnings+ and excluded from ALL statistics, so that MAE/RMSE are
+ # averaged over the compounds that were actually evaluated.
+ #
+ # @return [Hash] "R^2", "RMSE" and "MAE"; all values are nil when no entry
+ #   could be evaluated
+ def statistics
+   squared_error_sum = 0.0
+   weighted_squared_error_sum = 0.0
+   absolute_error_sum = 0.0
+   weighted_absolute_error_sum = 0.0
+   confidence_sum = 0.0
+   measurements = []
+   predicted_values = []
+   predictions.each do |pred|
+     compound_id,activity,prediction,confidence = pred
+     # an Array is always truthy, so check for actual measurements instead
+     measured = Array(activity).compact
+     if measured.empty? or prediction.nil?
+       message = "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+       warnings << message
+       $logger.debug message
+       next
+     end
+     measurement = measured.median
+     error = Math.log10(prediction)-Math.log10(measurement)
+     squared_error_sum += error**2
+     weighted_squared_error_sum += confidence*error**2
+     absolute_error_sum += error.abs
+     weighted_absolute_error_sum += confidence*error.abs
+     confidence_sum += confidence
+     # keep measurement/prediction pairs aligned for the correlation below
+     measurements << measurement
+     predicted_values << prediction
+   end
+
+   nr_evaluated = measurements.size
+   # nothing to evaluate: avoid division by zero and a pointless R call
+   return { "R^2" => nil, "RMSE" => nil, "MAE" => nil } if nr_evaluated.zero?
+
+   R.assign "measurement", measurements
+   R.assign "prediction", predicted_values
+   R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+   r = R.eval("r").to_ruby
+   # NOTE(review): cor() is undefined for fewer than two pairs; presumably this arrives as nil - confirm
+   r_squared = r.nil? ? nil : r**2
+
+   mae = absolute_error_sum/nr_evaluated
+   rmse = Math.sqrt(squared_error_sum/nr_evaluated)
+   # weighted statistics are only defined for a positive confidence sum
+   weighted_mae = confidence_sum > 0 ? weighted_absolute_error_sum/confidence_sum : nil
+   weighted_rmse = confidence_sum > 0 ? Math.sqrt(weighted_squared_error_sum/confidence_sum) : nil
+=begin
+ update_attributes(
+ mae: mae,
+ rmse: rmse,
+ weighted_mae: weighted_mae,
+ weighted_rmse: weighted_rmse,
+ r_squared: r**2,
+ finished_at: Time.now
+ )
+=end
+   puts "R^2 #{r_squared}"
+   puts "RMSE #{rmse}"
+   puts "MAE #{mae}"
+   return { "R^2" => r_squared, "RMSE" => rmse, "MAE" => mae }
+ end
+
end
class ClassificationValidation < Validation
diff --git a/test/validation.rb b/test/validation.rb
index 066ec95..b1dc95e 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -73,21 +73,11 @@ class ValidationTest < MiniTest::Test
def test_pls_regression_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
- params = {
- :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression",
- }
+ params = { :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", }
model = Model::LazarRegression.create dataset, params
cv = RegressionCrossValidation.create model
- #p cv
- cv.validation_ids.each do |vid|
- model = Model::Lazar.find(Validation.find(vid).model_id)
- p model
- #assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
- #assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
- #refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
- end
-
+ p cv.nr_instances
+ p cv.nr_unpredicted
assert cv.rmse < 1.5, "RMSE > 1.5"
assert cv.mae < 1
end