From e778475c578f13f30af4437845716d7e781c2609 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 13 Feb 2016 13:15:29 +0100 Subject: improved handling of duplicates in validations --- lib/crossvalidation.rb | 3 --- lib/dataset.rb | 1 + lib/model.rb | 30 ++++++++++-------------- lib/regression.rb | 62 ++++++++++++++++++++++++++++++-------------------- lib/validation.rb | 62 ++++++++++++++++++++++++++++++++++++++++++++++---- test/validation.rb | 16 +++---------- 6 files changed, 111 insertions(+), 63 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 9789882..0c5f0be 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -176,7 +176,6 @@ module OpenTox mae = 0 weighted_mae = 0 confidence_sum = 0 - p predictions predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction @@ -195,8 +194,6 @@ module OpenTox y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y - p x - p y R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" r = R.eval("r").to_ruby diff --git a/lib/dataset.rb b/lib/dataset.rb index 55cde63..7925bcd 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -93,6 +93,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n + # TODO fix splits for duplicates len = self.compound_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) diff --git a/lib/model.rb b/lib/model.rb index 44b36e6..0d2354f 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -48,7 +48,7 @@ module OpenTox self end - def predict object, use_database_values=true + def predict object t = Time.now at = Time.now @@ -79,31 +79,21 @@ module OpenTox # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) database_activities = nil + prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s] + prediction[:database_activities] = database_activities + prediction[:warning] = "#{database_activities.size} structures have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? - prediction = {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."}) else - prediction = Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id}) + prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id})) end - prediction[:database_activities] = database_activities predictions << prediction - -=begin -# TODO scaled dataset for physchem - p neighbor_algorithm_parameters - p (neighbor_algorithm_parameters["feature_dataset_id"]) - d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) - p d - p d.class - if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset - p "SCALED" - end -=end end # serialize result @@ -116,6 +106,8 @@ module OpenTox return predictions when "OpenTox::Dataset" # prepare prediction dataset + measurement_feature = prediction_feature + prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, @@ -125,9 +117,11 @@ module OpenTox confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) # TODO move into warnings field warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]} + #prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]} + # TODO fix dataset measurements + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence] , p[:dataset_activities].to_s, p[:warning]]} prediction_dataset.save_all return prediction_dataset end diff --git a/lib/regression.rb b/lib/regression.rb index 7c64d8f..2b41851 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,23 +4,19 @@ module OpenTox class Regression def self.weighted_average compound, params - #p params.keys weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] - #activities = [] neighbors.each do |row| - #if row["dataset_ids"].include? params[:training_dataset_id] - sim = row["tanimoto"] - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) - #activities << act # TODO: Transformation?? - sim_sum += sim - end - #end + sim = row["tanimoto"] + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + #activities << act # TODO: Transformation?? + sim_sum += sim + end end #R.assign "activities", activities #R.eval "cv = cv(activities)" @@ -35,7 +31,7 @@ module OpenTox def self.local_pls_regression compound, params neighbors = params[:neighbors] - return {:value => nil, :confidence => nil} unless neighbors.size > 0 + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 activities = [] fingerprints = {} weights = [] @@ -62,21 +58,37 @@ module OpenTox fingerprints.each do |k,v| unless v.uniq.size == 1 data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" - variables << "'#{k}'" + variables << k end end - begin + if variables.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result + return {:value => nil, :confidence => nil} # TODO confidence + else R.eval "data <- data.frame(#{data_frame.join ","})" - R.eval "names(data) <- c('activities',#{variables.join ','})" - R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)" - compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } - R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" - R.eval "names(fingerprint) <- c(#{variables.join ','})" - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - {:value => prediction, :confidence => 1} # TODO confidence - rescue - {:value => nil, :confidence => nil} # TODO confidence + R.assign "features", variables + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" + rescue # fall back to weighted average + result = weighted_average(compound, params) + result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return result + end + #begin + #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX + compound_features = variables.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence + #rescue + #p "Prediction failed" + #return {:value => nil, :confidence => nil} # TODO confidence + #end end end diff --git a/lib/validation.rb b/lib/validation.rb index c52ffc0..651860e 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -29,17 +29,22 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used + cids = test_set.compound_ids + + test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] nr_unpredicted = 0 activities = test_set.data_entries.collect{|de| de.first} prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] and de[1] and de[1].numeric? - activity = activities[i] + if de[0] and de[1] + cid = prediction_dataset.compound_ids[i] + rows = cids.each_index.select{|r| cids[r] == cid } + activities = rows.collect{|r| test_set.data_entries[r][0]} + #activity = activities[i] prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] + predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] else nr_unpredicted += 1 end @@ -57,6 +62,55 @@ module OpenTox validation end + def statistics + rmse = 0 + weighted_rmse = 0 + rse = 0 + weighted_rse = 0 + mae = 0 + weighted_mae = 0 + confidence_sum = 0 + predictions.each do |pred| + compound_id,activity,prediction,confidence = pred + if activity and prediction + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + confidence_sum += confidence + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + x = predictions.collect{|p| p[1].median} + y = predictions.collect{|p| p[2]} + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + weighted_mae = weighted_mae/confidence_sum + rmse = Math.sqrt(rmse/predictions.size) + weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) +=begin + update_attributes( + mae: mae, + rmse: rmse, + weighted_mae: weighted_mae, + weighted_rmse: weighted_rmse, + r_squared: r**2, + finished_at: Time.now + ) +=end + puts "R^2 #{r**2}" + puts "RMSE #{rmse}" + puts "MAE #{mae}" + return { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } + end + end class ClassificationValidation < Validation diff --git a/test/validation.rb b/test/validation.rb index 066ec95..b1dc95e 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -73,21 +73,11 @@ class ValidationTest < MiniTest::Test def test_pls_regression_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" - #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" - params = { - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", - } + params = { :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", } model = Model::LazarRegression.create dataset, params cv = RegressionCrossValidation.create model - #p cv - cv.validation_ids.each do |vid| - model = Model::Lazar.find(Validation.find(vid).model_id) - p model - #assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type] - #assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim] - #refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id] - end - + p cv.nr_instances + p cv.nr_unpredicted assert cv.rmse < 1.5, "RMSE > 1.5" assert cv.mae < 1 end -- cgit v1.2.3