From 72f6cd966a249859e009a0db5f7b089aad1d6511 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 29 Feb 2016 08:59:43 +0100 Subject: regression crossvalidation fixed --- lib/crossvalidation.rb | 20 +++++++------ lib/regression.rb | 74 ++++++++++++++++++++---------------------------- test/lazar-regression.rb | 2 +- test/validation.rb | 20 ++----------- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 362842e..ea32a2b 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -176,11 +176,15 @@ module OpenTox mae = 0 weighted_mae = 0 confidence_sum = 0 + x = [] + y = [] predictions.each do |pred| compound_id,activity,prediction,confidence = pred - if activity and prediction - activity.each do |act| - error = Math.log10(prediction)-Math.log10(act) + if activity and prediction + unless activity == [nil] + x << -Math.log10(activity.median) + y << -Math.log10(prediction) + error = Math.log10(prediction)-Math.log10(activity.median) rmse += error**2 weighted_rmse += confidence*error**2 mae += error.abs @@ -192,22 +196,20 @@ module OpenTox $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." end end - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" r = R.eval("r").to_ruby mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum + #weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) update_attributes( mae: mae, rmse: rmse, - weighted_mae: weighted_mae, - weighted_rmse: weighted_rmse, + #weighted_mae: weighted_mae, + #weighted_rmse: weighted_rmse, r_squared: r**2, finished_at: Time.now ) diff --git a/lib/regression.rb b/lib/regression.rb index 10a1861..0694a68 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,6 +1,7 @@ module OpenTox module Algorithm + # TODO add LOO errors class Regression def self.weighted_average compound, params @@ -11,19 +12,11 @@ module OpenTox neighbors.each do |row| sim = row["tanimoto"] confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) - #activities << act # TODO: Transformation?? sim_sum += sim end end - #R.assign "activities", activities - #R.eval "cv = cv(activities)" - #confidence /= activities.standard_deviation#/activities.mean - #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] - #confidence = sim_sum/neighbors.size.to_f - #confidence = neighbors.size.to_f confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) {:value => prediction,:confidence => confidence} @@ -94,45 +87,46 @@ module OpenTox end def self.local_physchem_regression compound, params + neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + activities = [] - fingerprints = {} weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + physchem = {} neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint row["features"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) + weights << row["tanimoto"] # TODO cosine ? + neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v end end end - name = Feature.find(params[:prediction_feature_id]).name - R.assign "activities", activities - R.assign "weights", weights - variables = [] - data_frame = ["c(#{activities.join ","})"] - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" - variables << k - end + # remove properties with a single value + physchem.each do |pid,v| + physchem.delete(pid) if v.uniq.size <= 1 end - if variables.empty? - result = weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - return {:value => nil, :confidence => nil} # TODO confidence + + if physchem.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result else + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "weights", weights + data_frame = ["c(#{activities.join ","})"] + physchem.keys.each do |pid| + data_frame << "c(#{physchem[pid].join ","})" + end R.eval "data <- data.frame(#{data_frame.join ","})" - R.assign "features", variables + R.assign "features", physchem.keys R.eval "names(data) <- append(c('activities'),features)" # begin R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" @@ -141,18 +135,12 @@ module OpenTox result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return result end - #begin - #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX - compound_features = variables.collect{|f| compound.fingerprint.include? f } - R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" - R.eval "names(fingerprint) <- features" # - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - return {:value => prediction, :confidence => 1} # TODO confidence - #rescue - #p "Prediction failed" - #return {:value => nil, :confidence => nil} # TODO confidence - #end + compound_features = physchem.keys.collect{|pid| compound.physchem[pid]} + R.eval "fingerprint <- rbind(c(#{compound_features.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence end end diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb index 932b91c..ae8f725 100644 --- a/test/lazar-regression.rb +++ b/test/lazar-regression.rb @@ -42,7 +42,7 @@ class LazarRegressionTest < MiniTest::Test prediction = model.predict compound model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression") prediction = model.predict compound - p prediction + # TODO assertions #assert_equal 13.6, prediction[:value].round(1) #assert_equal 0.83, prediction[:confidence].round(2) #assert_equal 1, prediction[:neighbors].size diff --git a/test/validation.rb b/test/validation.rb index b1dc95e..d8aae87 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -115,28 +115,14 @@ class ValidationTest < MiniTest::Test end def test_physchem_regression_crossvalidation - skip - - @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys - refute_empty @descriptors # UPLOAD DATA training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv") - feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors - feature_dataset.save - scaled_feature_dataset = feature_dataset.scale - scaled_feature_dataset.save - model = Model::LazarRegression.create training_dataset - model.neighbor_algorithm = "physchem_neighbors" - model.neighbor_algorithm_parameters = { - :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem", - :descriptors => @descriptors, - :feature_dataset_id => scaled_feature_dataset.id, - :min_sim => 0.3 - } - model.save + model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") cv = RegressionCrossValidation.create model p cv + p cv.id + p cv.statistics end def test_classification_loo_validation -- cgit v1.2.3