From 6bde559981fa11ffd265af708956f9d4ee6c9a89 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 8 Oct 2015 10:32:31 +0200 Subject: crossvalidation plots, original classification confidence --- lib/classification.rb | 4 +- lib/crossvalidation.rb | 111 ++++++++++++++++++++++++++----------------------- lib/lazar.rb | 3 ++ lib/model.rb | 4 +- lib/overwrite.rb | 10 +++++ lib/regression.rb | 8 ++++ lib/validation.rb | 4 +- 7 files changed, 85 insertions(+), 59 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 0a32126..b4b2e59 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -11,7 +11,7 @@ module OpenTox confidence = 0.0 neighbors.each do |row| n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor + #confidence = sim if sim > confidence # distance to nearest neighbor acts.each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim @@ -24,7 +24,7 @@ module OpenTox sim_sum = weighted_sum[weighted_sum.keys[0]] sim_sum -= weighted_sum[weighted_sum.keys[1]] sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] - #confidence = (sim_sum/neighbors.size).abs + confidence = (sim_sum/neighbors.size).abs return {:value => prediction,:confidence => confidence} else bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 6dc8d7f..cbffb7c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions + predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics @@ -69,6 +69,7 @@ module OpenTox field :weighted_accuracy, type: Float field :true_rate, type: Hash field :predictivity, type: Hash + field :confidence_plot_id, type: BSON::ObjectId # TODO auc, f-measure (usability??) def statistics @@ -126,6 +127,30 @@ module OpenTox $logger.debug "Accuracy #{accuracy}" end + def confidence_plot + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + if p[1] and p[2] + p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[3] + + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + $gridfs.find_one(_id: confidence_plot_id).data + end + #Average area under roc 0.646 #Area under roc 0.646 #F measure carcinogen: 0.769, noncarcinogen: 0.348 @@ -176,16 +201,6 @@ module OpenTox weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - # TODO check!! -=begin - predictions.sort! do |a,b| - relative_error_a = (a[1]-a[2]).abs/a[1].to_f - relative_error_a = 1/relative_error_a if relative_error_a < 1 - relative_error_b = (b[1]-b[2]).abs/b[1].to_f - relative_error_b = 1/relative_error_b if relative_error_b < 1 - [relative_error_b,b[3]] <=> [relative_error_a,a[3]] - end -=end update_attributes( mae: mae, rmse: rmse, @@ -201,44 +216,46 @@ module OpenTox def misclassifications n=nil #n = predictions.size unless n - n = 20 unless n + n ||= 10 model = Model::Lazar.find(self.model_id) training_dataset = Dataset.find(model.training_dataset_id) prediction_feature = training_dataset.features.first - predictions[0..n-1].collect do |p| - compound = Compound.find(p[0]) - neighbors = compound.neighbors.collect do |n| - neighbor = Compound.find(n[0]) - values = training_dataset.values(neighbor,prediction_feature) - { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => n[1], :measurements => values} + predictions.collect do |p| + unless p.include? nil + compound = Compound.find(p[0]) + neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) + neighbors.collect! do |n| + neighbor = Compound.find(n[0]) + values = training_dataset.values(neighbor,prediction_feature) + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + end + { + :smiles => compound.smiles, + #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, + :measured => p[1], + :predicted => p[2], + #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, + :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :relative_error => (p[1]-p[2]).abs/p[1], + :confidence => p[3], + :neighbors => neighbors + } end - { - :smiles => compound.smiles, - :fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, - :measured => p[1], - :predicted => p[2], - :relative_error => (p[1]-p[2]).abs/p[1].to_f, - :confidence => p[3], - :neighbors => neighbors - } - end + end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] end def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.svg" - sorted_predictions = predictions.sort{|a,b| b[3]<=>a[3]}.collect{|p| [(Math.log10(p[1])-Math.log10(p[2]))**2,p[3]]} + sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} - #R.assign "p", predictions.collect{|p| p[2]} - R.assign "confidence", predictions.collect{|p| p[2]} - #R.eval "diff = log(m)-log(p)" - R.eval "library(ggplot2)" - R.eval "svg(filename='#{tmpfile}')" - R.eval "image = qplot(confidence,error)"#,main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.assign "confidence", sorted_predictions.collect{|p| p[1]} + # TODO fix axis names + R.eval "image = qplot(confidence,error)" + R.eval "image = image + stat_smooth(method='lm', se=FALSE)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - R.eval "dev.off()" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) $gridfs.find_one(_id: confidence_plot_id).data end @@ -250,29 +267,17 @@ module OpenTox attributes = Model::Lazar.find(self.model_id).attributes attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - p "'"+attributes - R.eval "library(ggplot2)" - R.eval "library(grid)" - R.eval "library(gridExtra)" R.assign "measurement", x R.assign "prediction", y - #R.eval "error <- log(Measurement)-log(Prediction)" - #R.eval "rmse <- sqrt(mean(error^2, na.rm=T))" - #R.eval "mae <- mean(abs(error), na.rm=T)" - #R.eval "r <- cor(-log(prediction),-log(measurement))" - R.eval "svg(filename='#{tmpfile}')" R.eval "all = c(-log(measurement),-log(prediction))" R.eval "range = c(min(all), max(all))" R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)" - R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)" - R.eval "grid.arrange(image, text, ncol=2)" - R.eval "dev.off()" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end - p correlation_plot_id $gridfs.find_one(_id: correlation_plot_id).data end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 89b50f7..f801062 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -21,6 +21,9 @@ $gridfs = $mongo.database.fs # R setup R = Rserve::Connection.new +R.eval "library(ggplot2)" +R.eval "library(grid)" +R.eval "library(gridExtra)" # Logger setup STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files diff --git a/lib/model.rb b/lib/model.rb index cd88e0c..98433d0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -48,7 +48,7 @@ module OpenTox self end - def predict object + def predict object, use_database_values=true t = Time.now at = Time.now @@ -75,7 +75,7 @@ module OpenTox compounds.each_with_index do |compound,c| t = Time.new database_activities = training_dataset.values(compound,prediction_feature) - if database_activities and !database_activities.empty? + if use_database_values and database_activities and !database_activities.empty? database_activities = database_activities.first if database_activities.size == 1 predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} next diff --git a/lib/overwrite.rb b/lib/overwrite.rb index be90c56..c92ad2b 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -96,6 +96,16 @@ class Array self.inject{ |sum, el| sum + el }.to_f / self.size end + def sample_variance + m = self.mean + sum = self.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.length - 1).to_f + end + + def standard_deviation + Math.sqrt(self.sample_variance) + end + end module URI diff --git a/lib/regression.rb b/lib/regression.rb index 9062a9e..868c25f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -24,16 +24,24 @@ module OpenTox sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] + activities = [] neighbors.each do |row| n,sim,acts = row confidence = sim if sim > confidence # distance to nearest neighbor # TODO add LOO errors acts.each do |act| weighted_sum += sim*Math.log10(act) + activities << act sim_sum += sim end end + #R.assign "activities", activities + #R.eval "cv = cv(activities)" + #confidence /= activities.standard_deviation#/activities.mean #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] + #confidence = sim_sum/neighbors.size.to_f + #confidence = neighbors.size.to_f + confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) {:value => prediction,:confidence => confidence} end diff --git a/lib/validation.rb b/lib/validation.rb index 9eebef8..c52ffc0 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -39,7 +39,7 @@ module OpenTox activity = activities[i] prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence] + predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] else nr_unpredicted += 1 end @@ -50,7 +50,7 @@ module OpenTox :test_dataset_id => test_set.id, :nr_instances => test_set.compound_ids.size, :nr_unpredicted => nr_unpredicted, - :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) validation.crossvalidation_id = crossvalidation.id if crossvalidation validation.save -- cgit v1.2.3