From e778475c578f13f30af4437845716d7e781c2609 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 13 Feb 2016 13:15:29 +0100 Subject: improved handling of duplicates in validations --- lib/validation.rb | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'lib/validation.rb') diff --git a/lib/validation.rb b/lib/validation.rb index c52ffc0..651860e 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -29,17 +29,22 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used + cids = test_set.compound_ids + + test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] nr_unpredicted = 0 activities = test_set.data_entries.collect{|de| de.first} prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] and de[1] and de[1].numeric? - activity = activities[i] + if de[0] and de[1] + cid = prediction_dataset.compound_ids[i] + rows = cids.each_index.select{|r| cids[r] == cid } + activities = rows.collect{|r| test_set.data_entries[r][0]} + #activity = activities[i] prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] + predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] else nr_unpredicted += 1 end @@ -57,6 +62,55 @@ module OpenTox validation end + def statistics + rmse = 0 + weighted_rmse = 0 + rse = 0 + weighted_rse = 0 + mae = 0 + weighted_mae = 0 + confidence_sum = 0 + predictions.each do |pred| + compound_id,activity,prediction,confidence = pred + if activity and prediction + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + confidence_sum += confidence + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + x = predictions.collect{|p| p[1].median} + y = predictions.collect{|p| p[2]} + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + weighted_mae = weighted_mae/confidence_sum + rmse = Math.sqrt(rmse/predictions.size) + weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) +=begin + update_attributes( + mae: mae, + rmse: rmse, + weighted_mae: weighted_mae, + weighted_rmse: weighted_rmse, + r_squared: r**2, + finished_at: Time.now + ) +=end + puts "R^2 #{r**2}" + puts "RMSE #{rmse}" + puts "MAE #{mae}" + return { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } + end + end class ClassificationValidation < Validation -- cgit v1.2.3