From 6bde559981fa11ffd265af708956f9d4ee6c9a89 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 8 Oct 2015 10:32:31 +0200 Subject: crossvalidation plots, original classification confidence --- lib/classification.rb | 4 +- lib/crossvalidation.rb | 111 ++++++++++++++++++++++++++----------------------- lib/lazar.rb | 3 ++ lib/model.rb | 4 +- lib/overwrite.rb | 10 +++++ lib/regression.rb | 8 ++++ lib/validation.rb | 4 +- test/compound.rb | 13 ++++++ test/setup.rb | 4 +- test/validation.rb | 36 ++++++++++++---- 10 files changed, 129 insertions(+), 68 deletions(-) diff --git a/lib/classification.rb b/lib/classification.rb index 0a32126..b4b2e59 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -11,7 +11,7 @@ module OpenTox confidence = 0.0 neighbors.each do |row| n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor + #confidence = sim if sim > confidence # distance to nearest neighbor acts.each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim @@ -24,7 +24,7 @@ module OpenTox sim_sum = weighted_sum[weighted_sum.keys[0]] sim_sum -= weighted_sum[weighted_sum.keys[1]] sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] - #confidence = (sim_sum/neighbors.size).abs + confidence = (sim_sum/neighbors.size).abs return {:value => prediction,:confidence => confidence} else bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 6dc8d7f..cbffb7c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions + predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics @@ -69,6 +69,7 @@ module OpenTox field :weighted_accuracy, type: Float field :true_rate, type: Hash field :predictivity, type: Hash + field :confidence_plot_id, type: BSON::ObjectId # TODO auc, f-measure (usability??) def statistics @@ -126,6 +127,30 @@ module OpenTox $logger.debug "Accuracy #{accuracy}" end + def confidence_plot + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + if p[1] and p[2] + p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[3] + + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + $gridfs.find_one(_id: confidence_plot_id).data + end + #Average area under roc 0.646 #Area under roc 0.646 #F measure carcinogen: 0.769, noncarcinogen: 0.348 @@ -176,16 +201,6 @@ module OpenTox weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - # TODO check!! -=begin - predictions.sort! do |a,b| - relative_error_a = (a[1]-a[2]).abs/a[1].to_f - relative_error_a = 1/relative_error_a if relative_error_a < 1 - relative_error_b = (b[1]-b[2]).abs/b[1].to_f - relative_error_b = 1/relative_error_b if relative_error_b < 1 - [relative_error_b,b[3]] <=> [relative_error_a,a[3]] - end -=end update_attributes( mae: mae, rmse: rmse, @@ -201,44 +216,46 @@ module OpenTox def misclassifications n=nil #n = predictions.size unless n - n = 20 unless n + n ||= 10 model = Model::Lazar.find(self.model_id) training_dataset = Dataset.find(model.training_dataset_id) prediction_feature = training_dataset.features.first - predictions[0..n-1].collect do |p| - compound = Compound.find(p[0]) - neighbors = compound.neighbors.collect do |n| - neighbor = Compound.find(n[0]) - values = training_dataset.values(neighbor,prediction_feature) - { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => n[1], :measurements => values} + predictions.collect do |p| + unless p.include? nil + compound = Compound.find(p[0]) + neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) + neighbors.collect! do |n| + neighbor = Compound.find(n[0]) + values = training_dataset.values(neighbor,prediction_feature) + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + end + { + :smiles => compound.smiles, + #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, + :measured => p[1], + :predicted => p[2], + #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, + :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :relative_error => (p[1]-p[2]).abs/p[1], + :confidence => p[3], + :neighbors => neighbors + } end - { - :smiles => compound.smiles, - :fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, - :measured => p[1], - :predicted => p[2], - :relative_error => (p[1]-p[2]).abs/p[1].to_f, - :confidence => p[3], - :neighbors => neighbors - } - end + end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] end def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.svg" - sorted_predictions = predictions.sort{|a,b| b[3]<=>a[3]}.collect{|p| [(Math.log10(p[1])-Math.log10(p[2]))**2,p[3]]} + sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} - #R.assign "p", predictions.collect{|p| p[2]} - R.assign "confidence", predictions.collect{|p| p[2]} - #R.eval "diff = log(m)-log(p)" - R.eval "library(ggplot2)" - R.eval "svg(filename='#{tmpfile}')" - R.eval "image = qplot(confidence,error)"#,main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.assign "confidence", sorted_predictions.collect{|p| p[1]} + # TODO fix axis names + R.eval "image = qplot(confidence,error)" + R.eval "image = image + stat_smooth(method='lm', se=FALSE)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - R.eval "dev.off()" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) $gridfs.find_one(_id: confidence_plot_id).data end @@ -250,29 +267,17 @@ module OpenTox attributes = Model::Lazar.find(self.model_id).attributes attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - p "'"+attributes - R.eval "library(ggplot2)" - R.eval "library(grid)" - R.eval "library(gridExtra)" R.assign "measurement", x R.assign "prediction", y - #R.eval "error <- log(Measurement)-log(Prediction)" - #R.eval "rmse <- sqrt(mean(error^2, na.rm=T))" - #R.eval "mae <- mean(abs(error), na.rm=T)" - #R.eval "r <- cor(-log(prediction),-log(measurement))" - R.eval "svg(filename='#{tmpfile}')" R.eval "all = c(-log(measurement),-log(prediction))" R.eval "range = c(min(all), max(all))" R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)" - R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)" - R.eval "grid.arrange(image, text, ncol=2)" - R.eval "dev.off()" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end - p correlation_plot_id $gridfs.find_one(_id: correlation_plot_id).data end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 89b50f7..f801062 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -21,6 +21,9 @@ $gridfs = $mongo.database.fs # R setup R = Rserve::Connection.new +R.eval "library(ggplot2)" +R.eval "library(grid)" +R.eval "library(gridExtra)" # Logger setup STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files diff --git a/lib/model.rb b/lib/model.rb index cd88e0c..98433d0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -48,7 +48,7 @@ module OpenTox self end - def predict object + def predict object, use_database_values=true t = Time.now at = Time.now @@ -75,7 +75,7 @@ module OpenTox compounds.each_with_index do |compound,c| t = Time.new database_activities = training_dataset.values(compound,prediction_feature) - if database_activities and !database_activities.empty? + if use_database_values and database_activities and !database_activities.empty? database_activities = database_activities.first if database_activities.size == 1 predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} next diff --git a/lib/overwrite.rb b/lib/overwrite.rb index be90c56..c92ad2b 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -96,6 +96,16 @@ class Array self.inject{ |sum, el| sum + el }.to_f / self.size end + def sample_variance + m = self.mean + sum = self.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.length - 1).to_f + end + + def standard_deviation + Math.sqrt(self.sample_variance) + end + end module URI diff --git a/lib/regression.rb b/lib/regression.rb index 9062a9e..868c25f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -24,16 +24,24 @@ module OpenTox sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] + activities = [] neighbors.each do |row| n,sim,acts = row confidence = sim if sim > confidence # distance to nearest neighbor # TODO add LOO errors acts.each do |act| weighted_sum += sim*Math.log10(act) + activities << act sim_sum += sim end end + #R.assign "activities", activities + #R.eval "cv = cv(activities)" + #confidence /= activities.standard_deviation#/activities.mean #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] + #confidence = sim_sum/neighbors.size.to_f + #confidence = neighbors.size.to_f + confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) {:value => prediction,:confidence => confidence} end diff --git a/lib/validation.rb b/lib/validation.rb index 9eebef8..c52ffc0 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -39,7 +39,7 @@ module OpenTox activity = activities[i] prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence] + predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] else nr_unpredicted += 1 end @@ -50,7 +50,7 @@ module OpenTox :test_dataset_id => test_set.id, :nr_instances => test_set.compound_ids.size, :nr_unpredicted => nr_unpredicted, - :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) validation.crossvalidation_id = crossvalidation.id if crossvalidation validation.save diff --git a/test/compound.rb b/test/compound.rb index 036f384..24356d3 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -160,4 +160,17 @@ print c.sdf end end end + + def test_fingerprint_db_neighbors + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv") + [ + "CC(=O)CC(C)C#N", + "CC(=O)CC(C)C", + "C(=O)CC(C)C#N", + ].each do |smi| + c = OpenTox::Compound.from_smiles smi + neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2) + p neighbors + end + end end diff --git a/test/setup.rb b/test/setup.rb index 3dad683..ba1b7af 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -3,5 +3,7 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") +Mongoid.configure.connect_to("test") +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/test') #$mongo.database.drop -#$gridfs = $mongo.database.fs # recreate GridFS indexes +$gridfs = $mongo.database.fs diff --git a/test/validation.rb b/test/validation.rb index af5ea60..6764a32 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -16,11 +16,35 @@ class ValidationTest < MiniTest::Test model = Model::LazarClassification.create dataset#, features cv = ClassificationCrossValidation.create model assert cv.accuracy > 0.7 + File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot} + `inkview tmp.svg` p cv.nr_unpredicted p cv.accuracy #assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy." end + def test_default_regression_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + model = Model::LazarRegression.create dataset + cv = RegressionCrossValidation.create model + #cv = RegressionCrossValidation.find '561503262b72ed54fd000001' + p cv.id + File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot} + `inkview tmp.svg` + File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot} + `inkview tmp.svg` + + #puts cv.misclassifications.to_yaml + p cv.rmse + p cv.weighted_rmse + assert cv.rmse < 1.5, "RMSE > 1.5" + #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) " + p cv.mae + p cv.weighted_mae + assert cv.mae < 1 + #assert cv.weighted_mae < cv.mae + end + def test_regression_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" @@ -41,13 +65,8 @@ class ValidationTest < MiniTest::Test refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id] end - #`inkview #{cv.plot}` - #puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n" - #`inkview #{cv.plot}` - assert cv.rmse < 30, "RMSE > 30" - #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) " - assert cv.mae < 12 - #assert cv.weighted_mae < cv.mae + assert cv.rmse < 1.5, "RMSE > 30" + assert cv.mae < 1 end def test_repeated_crossvalidation @@ -55,7 +74,8 @@ class ValidationTest < MiniTest::Test model = Model::LazarClassification.create dataset repeated_cv = RepeatedCrossValidation.create model repeated_cv.crossvalidations.each do |cv| - assert cv.accuracy > 0.7 + assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" + assert_operator cv.weighted_accuracy, :>, cv.accuracy end end -- cgit v1.2.3