From e63e97086ac05e7a86f1a53bdcbc72eec0cabf16 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Mon, 9 Nov 2015 14:58:34 +0100
Subject: leave one out validation implemented

---
 lib/compound.rb                 |  20 ++--
 lib/lazar.rb                    |   3 +-
 lib/leave-one-out-validation.rb | 217 ++++++++++++++++++++++++++++++++++++++++
 test/validation.rb              |  25 +++++
 4 files changed, 257 insertions(+), 8 deletions(-)
 create mode 100644 lib/leave-one-out-validation.rb

diff --git a/lib/compound.rb b/lib/compound.rb
index ad0eaba..d5a4cbb 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -344,16 +344,22 @@ module OpenTox
       return mg
     end
 
-	# Get mg from mmol
-	# @return [Float] value in mg
-	def mmol_to_mg(value, mw)
+    # Get mg from mmol
+    # @return [Float] value in mg
+    def mmol_to_mg(value, mw)
       mg = (value.to_f)*(mw.to_f)
       return mg
     end
 
-	# Get mg from logmg
-	# @return [Float] value in mg
-	def logmg_to_mg(value)
+    # Get mmol from mg
+    # @return [Float] value in mmol
+    def mg_to_mmol mg
+      mg.to_f/molecular_weight
+    end
+
+    # Get mg from logmg
+    # @return [Float] value in mg
+    def logmg_to_mg(value)
       mg = 10**value.to_f
       return mg
     end
@@ -364,7 +370,7 @@ module OpenTox
       if self["molecular_weight"]==0.0 || self["molecular_weight"].nil?
         update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first)
       end
-      self["molecular_weight"]
+      self["molecular_weight"].to_f
     end
 
 
diff --git a/lib/lazar.rb b/lib/lazar.rb
index cc66841..5d9bc19 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -60,7 +60,7 @@ ENV['FMINER_SILENT'] = 'true'
 ENV['FMINER_NR_HITS'] = 'true'
 
 # OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
 
 [ # be aware of the require sequence as it affects class/method overwrites
   "overwrite.rb",
@@ -80,6 +80,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat
   "regression.rb",
   "validation.rb",
   "crossvalidation.rb",
+  "leave-one-out-validation.rb",
   "experiment.rb",
 ].each{ |f| require_relative f }
 
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
new file mode 100644
index 0000000..9db10c6
--- /dev/null
+++ b/lib/leave-one-out-validation.rb
@@ -0,0 +1,217 @@
+module OpenTox
+
+  # Validation of a lazar model against the compounds of its own training dataset.
+  # Use LeaveOneOutValidation.create, which picks the classification or regression subclass.
+  class LeaveOneOutValidation
+
+    field :model_id, type: BSON::ObjectId
+    field :dataset_id, type: BSON::ObjectId
+    field :nr_instances, type: Integer
+    field :nr_unpredicted, type: Integer
+    field :predictions, type: Array
+    field :finished_at, type: Time
+
+    # Predict all training compounds of a model, compute validation statistics and save them
+    # @param model [Model::Lazar] model to validate
+    # @return [LeaveOneOutValidation] saved classification or regression validation
+    def self.create model
+      klass = model.training_dataset.features.first.nominal? ? ClassificationLeaveOneOutValidation : RegressionLeaveOneOutValidation
+      loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
+      compound_ids = model.training_dataset.compound_ids
+      predictions = model.predict model.training_dataset.compounds
+      predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
+      predictions.select!{|p| p[:database_activities] && !p[:database_activities].empty?}
+      loo.nr_instances = predictions.size
+      predictions.select!{|p| p[:value]} # remove unpredicted
+      loo.predictions = predictions.sort{|a,b| b[:confidence] <=> a[:confidence]} # highest confidence first
+      loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
+      loo.statistics
+      loo.save
+      loo
+    end
+
+    # @return [Model::Lazar] validated model
+    def model
+      Model::Lazar.find model_id
+    end
+  end
+
+  # Validation statistics for classification models (confusion matrix, accuracy, confidence plot)
+  class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
+
+    field :accept_values, type: Array
+    field :confusion_matrix, type: Array, default: []
+    field :weighted_confusion_matrix, type: Array, default: []
+    field :accuracy, type: Float
+    field :weighted_accuracy, type: Float
+    field :true_rate, type: Hash, default: {}
+    field :predictivity, type: Hash, default: {}
+    field :confidence_plot_id, type: BSON::ObjectId
+
+    # Compute and store confusion matrices, accuracies, true rates and predictivities
+    # Only the first two accept values are counted in the confusion matrices
+    def statistics
+      accept_values = Feature.find(model.prediction_feature_id).accept_values
+      # rows: predicted value, columns: database activity
+      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      predictions.each do |pred|
+        pred[:database_activities].each do |db_act|
+          if pred[:value]
+            if pred[:value] == db_act
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][0] += 1
+                weighted_confusion_matrix[0][0] += pred[:confidence]
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][1] += 1
+                weighted_confusion_matrix[1][1] += pred[:confidence]
+              end
+            else
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][1] += 1
+                weighted_confusion_matrix[0][1] += pred[:confidence]
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][0] += 1
+                weighted_confusion_matrix[1][0] += pred[:confidence]
+              end
+            end
+          end
+        end
+      end
+      accept_values.each_with_index do |v,i|
+        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+      end
+      confidence_sum = 0
+      weighted_confusion_matrix.each do |r|
+        r.each do |c|
+          confidence_sum += c
+        end
+      end
+      update_attributes(
+        accept_values: accept_values,
+        confusion_matrix: confusion_matrix,
+        weighted_confusion_matrix: weighted_confusion_matrix,
+        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
+        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        true_rate: true_rate,
+        predictivity: predictivity,
+        finished_at: Time.now
+      )
+      $logger.debug "Accuracy #{accuracy}"
+    end
+
+    # Plot accumulated accuracy against prediction confidence, the plot is created once and its id is stored
+    # @return data of the plot file stored in $gridfs
+    def confidence_plot
+      unless confidence_plot_id
+        tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+        accuracies = []
+        confidences = []
+        correct_predictions = 0
+        incorrect_predictions = 0
+        predictions.each do |p|
+          p[:database_activities].each do |db_act|
+            if p[:value]
+              p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
+              accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+              confidences << p[:confidence]
+            end
+          end
+        end
+        R.assign "accuracy", accuracies
+        R.assign "confidence", confidences
+        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+        plot_id = $gridfs.insert_one(file)
+        update(:confidence_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+  end
+
+  # Validation statistics for regression models (RMSE, MAE, R^2, correlation plot)
+  class RegressionLeaveOneOutValidation < LeaveOneOutValidation
+
+    field :rmse, type: Float, default: 0.0
+    field :mae, type: Float, default: 0
+    field :weighted_rmse, type: Float, default: 0
+    field :weighted_mae, type: Float, default: 0
+    field :r_squared, type: Float
+    field :correlation_plot_id, type: BSON::ObjectId
+    field :confidence_plot_id, type: BSON::ObjectId
+
+    # Compute and store RMSE, MAE (plain and confidence weighted) and R^2
+    # Errors are differences of log10 transformed predictions and measurements
+    def statistics
+      confidence_sum = 0
+      predicted_values = []
+      measured_values = []
+      predictions.each do |pred|
+        pred[:database_activities].each do |activity|
+          if pred[:value]
+            predicted_values << pred[:value]
+            measured_values << activity
+            error = Math.log10(pred[:value])-Math.log10(activity)
+            self.rmse += error**2
+            self.weighted_rmse += pred[:confidence]*error**2
+            self.mae += error.abs
+            self.weighted_mae += pred[:confidence]*error.abs
+            confidence_sum += pred[:confidence]
+          end
+        end
+        if pred[:database_activities].empty?
+          warnings << "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
+        end
+      end
+      R.assign "measurement", measured_values
+      R.assign "prediction", predicted_values
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+
+      # NOTE(review): errors are summed per database activity but averaged per prediction - confirm that this is intended
+      self.mae = self.mae/predictions.size
+      self.weighted_mae = self.weighted_mae/confidence_sum
+      self.rmse = Math.sqrt(self.rmse/predictions.size)
+      self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
+      self.r_squared = r**2
+      self.finished_at = Time.now
+      save
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
+    end
+
+    # Plot -log measurements against -log predictions, the plot is created once and its id is stored
+    # @return data of the plot file stored in $gridfs
+    def correlation_plot
+      unless correlation_plot_id
+        tmpfile = "/tmp/#{id.to_s}_correlation.svg"
+        predicted_values = []
+        measured_values = []
+        predictions.each do |pred|
+          pred[:database_activities].each do |activity|
+            if pred[:value]
+              predicted_values << pred[:value]
+              measured_values << activity
+            end
+          end
+        end
+        R.assign "measurement", measured_values
+        R.assign "prediction", predicted_values
+        R.eval "all = c(-log(measurement),-log(prediction))"
+        R.eval "range = c(min(all), max(all))"
+        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
+        plot_id = $gridfs.insert_one(file)
+        update(:correlation_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: correlation_plot_id).data
+    end
+  end
+
+end
diff --git a/test/validation.rb b/test/validation.rb
index 7de944c..95f9bc0 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -128,4 +128,29 @@ class ValidationTest < MiniTest::Test
     p cv
   end
 
+  def test_classification_loo_validation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    model = Model::LazarClassification.create dataset
+    loo = ClassificationLeaveOneOutValidation.create model
+    assert_equal 14, loo.nr_unpredicted
+    refute_empty loo.confusion_matrix
+    assert loo.accuracy > 0.77
+    assert loo.weighted_accuracy > 0.85
+    assert loo.accuracy < loo.weighted_accuracy
+  end
+
+  def test_regression_loo_validation
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
+    model = Model::LazarRegression.create dataset
+    loo = RegressionLeaveOneOutValidation.create model
+    assert_equal 11, loo.nr_unpredicted
+    assert loo.weighted_mae < loo.mae
+    assert loo.r_squared > 0.34
+    #assert_equal 14, loo.nr_unpredicted
+    #p loo.confusion_matrix
+    #p loo.accuracy
+    #File.open("tmp.svg","w+"){|f| f.puts loo.correlation_plot}
+    #`inkview tmp.svg`
+  end
+
 end
-- 
cgit v1.2.3