summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch> 2016-02-29 08:59:43 +0100
committer: Christoph Helma <helma@in-silico.ch> 2016-02-29 08:59:43 +0100
commit: 72f6cd966a249859e009a0db5f7b089aad1d6511 (patch)
tree: 8668abfd8e5cefdee9565b184b3ea63dd5e9491f
parent: d0c6234fed7d45227fcf9309cb6dc0854d17e647 (diff)
regression crossvalidation fixed
-rw-r--r-- lib/crossvalidation.rb 20
-rw-r--r-- lib/regression.rb 74
-rw-r--r-- test/lazar-regression.rb 2
-rw-r--r-- test/validation.rb 20
4 files changed, 46 insertions, 70 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 362842e..ea32a2b 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -176,11 +176,15 @@ module OpenTox
mae = 0
weighted_mae = 0
confidence_sum = 0
+ x = []
+ y = []
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
- if activity and prediction
- activity.each do |act|
- error = Math.log10(prediction)-Math.log10(act)
+ if activity and prediction
+ unless activity == [nil]
+ x << -Math.log10(activity.median)
+ y << -Math.log10(prediction)
+ error = Math.log10(prediction)-Math.log10(activity.median)
rmse += error**2
weighted_rmse += confidence*error**2
mae += error.abs
@@ -192,22 +196,20 @@ module OpenTox
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
end
end
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
r = R.eval("r").to_ruby
mae = mae/predictions.size
- weighted_mae = weighted_mae/confidence_sum
+ #weighted_mae = weighted_mae/confidence_sum
rmse = Math.sqrt(rmse/predictions.size)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+ #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
update_attributes(
mae: mae,
rmse: rmse,
- weighted_mae: weighted_mae,
- weighted_rmse: weighted_rmse,
+ #weighted_mae: weighted_mae,
+ #weighted_rmse: weighted_rmse,
r_squared: r**2,
finished_at: Time.now
)
diff --git a/lib/regression.rb b/lib/regression.rb
index 10a1861..0694a68 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,6 +1,7 @@
module OpenTox
module Algorithm
+ # TODO add LOO errors
class Regression
def self.weighted_average compound, params
@@ -11,19 +12,11 @@ module OpenTox
neighbors.each do |row|
sim = row["tanimoto"]
confidence = sim if sim > confidence # distance to nearest neighbor
- # TODO add LOO errors
row["features"][params[:prediction_feature_id].to_s].each do |act|
weighted_sum += sim*Math.log10(act)
- #activities << act # TODO: Transformation??
sim_sum += sim
end
end
- #R.assign "activities", activities
- #R.eval "cv = cv(activities)"
- #confidence /= activities.standard_deviation#/activities.mean
- #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size]
- #confidence = sim_sum/neighbors.size.to_f
- #confidence = neighbors.size.to_f
confidence = 0 if confidence.nan?
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
{:value => prediction,:confidence => confidence}
@@ -94,45 +87,46 @@ module OpenTox
end
def self.local_physchem_regression compound, params
+
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
+ return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+
activities = []
- fingerprints = {}
weights = []
- fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
+ physchem = {}
neighbors.each_with_index do |row,i|
neighbor = Compound.find row["_id"]
- fingerprint = neighbor.fingerprint
row["features"][params[:prediction_feature_id].to_s].each do |act|
activities << Math.log10(act)
- weights << row["tanimoto"]
- fingerprint_ids.each_with_index do |id,j|
- fingerprints[id] ||= []
- fingerprints[id] << fingerprint.include?(id)
+ weights << row["tanimoto"] # TODO cosine ?
+ neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+ physchem[pid] ||= []
+ physchem[pid] << v
end
end
end
- name = Feature.find(params[:prediction_feature_id]).name
- R.assign "activities", activities
- R.assign "weights", weights
- variables = []
- data_frame = ["c(#{activities.join ","})"]
- fingerprints.each do |k,v|
- unless v.uniq.size == 1
- data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
- variables << k
- end
+ # remove properties with a single value
+ physchem.each do |pid,v|
+ physchem.delete(pid) if v.uniq.size <= 1
end
- if variables.empty?
- result = weighted_average(compound, params)
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
- return result
- return {:value => nil, :confidence => nil} # TODO confidence
+
+ if physchem.empty?
+ result = weighted_average(compound, params)
+ result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
+ return result
else
+
+ name = Feature.find(params[:prediction_feature_id]).name
+ R.assign "weights", weights
+ data_frame = ["c(#{activities.join ","})"]
+ physchem.keys.each do |pid|
+ data_frame << "c(#{physchem[pid].join ","})"
+ end
R.eval "data <- data.frame(#{data_frame.join ","})"
- R.assign "features", variables
+ R.assign "features", physchem.keys
R.eval "names(data) <- append(c('activities'),features)" #
begin
R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)"
@@ -141,18 +135,12 @@ module OpenTox
result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
return result
end
- #begin
- #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX
- compound_features = variables.collect{|f| compound.fingerprint.include? f }
- R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
- R.eval "names(fingerprint) <- features" #
- R.eval "prediction <- predict(model,fingerprint)"
- prediction = 10**R.eval("prediction").to_f
- return {:value => prediction, :confidence => 1} # TODO confidence
- #rescue
- #p "Prediction failed"
- #return {:value => nil, :confidence => nil} # TODO confidence
- #end
+ compound_features = physchem.keys.collect{|pid| compound.physchem[pid]}
+ R.eval "fingerprint <- rbind(c(#{compound_features.join ','}))"
+ R.eval "names(fingerprint) <- features" #
+ R.eval "prediction <- predict(model,fingerprint)"
+ prediction = 10**R.eval("prediction").to_f
+ return {:value => prediction, :confidence => 1} # TODO confidence
end
end
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index 932b91c..ae8f725 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -42,7 +42,7 @@ class LazarRegressionTest < MiniTest::Test
prediction = model.predict compound
model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression")
prediction = model.predict compound
- p prediction
+ # TODO assertions
#assert_equal 13.6, prediction[:value].round(1)
#assert_equal 0.83, prediction[:confidence].round(2)
#assert_equal 1, prediction[:neighbors].size
diff --git a/test/validation.rb b/test/validation.rb
index b1dc95e..d8aae87 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -115,28 +115,14 @@ class ValidationTest < MiniTest::Test
end
def test_physchem_regression_crossvalidation
- skip
-
- @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys
- refute_empty @descriptors
# UPLOAD DATA
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors
- feature_dataset.save
- scaled_feature_dataset = feature_dataset.scale
- scaled_feature_dataset.save
- model = Model::LazarRegression.create training_dataset
- model.neighbor_algorithm = "physchem_neighbors"
- model.neighbor_algorithm_parameters = {
- :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem",
- :descriptors => @descriptors,
- :feature_dataset_id => scaled_feature_dataset.id,
- :min_sim => 0.3
- }
- model.save
+ model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
cv = RegressionCrossValidation.create model
p cv
+ p cv.id
+ p cv.statistics
end
def test_classification_loo_validation