summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-10-08 10:32:31 +0200
committerChristoph Helma <helma@in-silico.ch>2015-10-08 10:32:31 +0200
commit6bde559981fa11ffd265af708956f9d4ee6c9a89 (patch)
tree0fdeff56c476bb2eb0e6a2af895a1e9306645904
parentc974ddec27b8e505a8dc22a7c99f2e4b8682aa48 (diff)
crossvalidation plots, original classification confidence
-rw-r--r--lib/classification.rb4
-rw-r--r--lib/crossvalidation.rb111
-rw-r--r--lib/lazar.rb3
-rw-r--r--lib/model.rb4
-rw-r--r--lib/overwrite.rb10
-rw-r--r--lib/regression.rb8
-rw-r--r--lib/validation.rb4
-rw-r--r--test/compound.rb13
-rw-r--r--test/setup.rb4
-rw-r--r--test/validation.rb36
10 files changed, 129 insertions, 68 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index 0a32126..b4b2e59 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -11,7 +11,7 @@ module OpenTox
confidence = 0.0
neighbors.each do |row|
n,sim,acts = row
- confidence = sim if sim > confidence # distance to nearest neighbor
+ #confidence = sim if sim > confidence # distance to nearest neighbor
acts.each do |act|
weighted_sum[act] ||= 0
weighted_sum[act] += sim
@@ -24,7 +24,7 @@ module OpenTox
sim_sum = weighted_sum[weighted_sum.keys[0]]
sim_sum -= weighted_sum[weighted_sum.keys[1]]
sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
- #confidence = (sim_sum/neighbors.size).abs
+ confidence = (sim_sum/neighbors.size).abs
return {:value => prediction,:confidence => confidence}
else
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 6dc8d7f..cbffb7c 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -52,7 +52,7 @@ module OpenTox
cv.update_attributes(
nr_instances: nr_instances,
nr_unpredicted: nr_unpredicted,
- predictions: predictions
+ predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
)
$logger.debug "Nr unpredicted: #{nr_unpredicted}"
cv.statistics
@@ -69,6 +69,7 @@ module OpenTox
field :weighted_accuracy, type: Float
field :true_rate, type: Hash
field :predictivity, type: Hash
+ field :confidence_plot_id, type: BSON::ObjectId
# TODO auc, f-measure (usability??)
def statistics
@@ -126,6 +127,30 @@ module OpenTox
$logger.debug "Accuracy #{accuracy}"
end
+ def confidence_plot
+ tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+ accuracies = []
+ confidences = []
+ correct_predictions = 0
+ incorrect_predictions = 0
+ predictions.each do |p|
+ if p[1] and p[2]
+ p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+ confidences << p[3]
+
+ end
+ end
+ R.assign "accuracy", accuracies
+ R.assign "confidence", confidences
+ R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
+ $gridfs.find_one(_id: confidence_plot_id).data
+ end
+
#Average area under roc 0.646
#Area under roc 0.646
#F measure carcinogen: 0.769, noncarcinogen: 0.348
@@ -176,16 +201,6 @@ module OpenTox
weighted_mae = weighted_mae/confidence_sum
rmse = Math.sqrt(rmse/predictions.size)
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- # TODO check!!
-=begin
- predictions.sort! do |a,b|
- relative_error_a = (a[1]-a[2]).abs/a[1].to_f
- relative_error_a = 1/relative_error_a if relative_error_a < 1
- relative_error_b = (b[1]-b[2]).abs/b[1].to_f
- relative_error_b = 1/relative_error_b if relative_error_b < 1
- [relative_error_b,b[3]] <=> [relative_error_a,a[3]]
- end
-=end
update_attributes(
mae: mae,
rmse: rmse,
@@ -201,44 +216,46 @@ module OpenTox
def misclassifications n=nil
#n = predictions.size unless n
- n = 20 unless n
+ n ||= 10
model = Model::Lazar.find(self.model_id)
training_dataset = Dataset.find(model.training_dataset_id)
prediction_feature = training_dataset.features.first
- predictions[0..n-1].collect do |p|
- compound = Compound.find(p[0])
- neighbors = compound.neighbors.collect do |n|
- neighbor = Compound.find(n[0])
- values = training_dataset.values(neighbor,prediction_feature)
- { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => n[1], :measurements => values}
+ predictions.collect do |p|
+ unless p.include? nil
+ compound = Compound.find(p[0])
+ neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+ neighbors.collect! do |n|
+ neighbor = Compound.find(n[0])
+ values = training_dataset.values(neighbor,prediction_feature)
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+ end
+ {
+ :smiles => compound.smiles,
+ #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+ :measured => p[1],
+ :predicted => p[2],
+ #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+ :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+ :relative_error => (p[1]-p[2]).abs/p[1],
+ :confidence => p[3],
+ :neighbors => neighbors
+ }
end
- {
- :smiles => compound.smiles,
- :fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
- :measured => p[1],
- :predicted => p[2],
- :relative_error => (p[1]-p[2]).abs/p[1].to_f,
- :confidence => p[3],
- :neighbors => neighbors
- }
- end
+ end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
end
def confidence_plot
tmpfile = "/tmp/#{id.to_s}_confidence.svg"
- sorted_predictions = predictions.sort{|a,b| b[3]<=>a[3]}.collect{|p| [(Math.log10(p[1])-Math.log10(p[2]))**2,p[3]]}
+ sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
R.assign "error", sorted_predictions.collect{|p| p[0]}
- #R.assign "p", predictions.collect{|p| p[2]}
- R.assign "confidence", predictions.collect{|p| p[2]}
- #R.eval "diff = log(m)-log(p)"
- R.eval "library(ggplot2)"
- R.eval "svg(filename='#{tmpfile}')"
- R.eval "image = qplot(confidence,error)"#,main='#{self.name}',asp=1,xlim=range, ylim=range)"
+ R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+ # TODO fix axis names
+ R.eval "image = qplot(confidence,error)"
+ R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
R.eval "ggsave(file='#{tmpfile}', plot=image)"
- R.eval "dev.off()"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
$gridfs.find_one(_id: confidence_plot_id).data
end
@@ -250,29 +267,17 @@ module OpenTox
attributes = Model::Lazar.find(self.model_id).attributes
attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
- p "'"+attributes
- R.eval "library(ggplot2)"
- R.eval "library(grid)"
- R.eval "library(gridExtra)"
R.assign "measurement", x
R.assign "prediction", y
- #R.eval "error <- log(Measurement)-log(Prediction)"
- #R.eval "rmse <- sqrt(mean(error^2, na.rm=T))"
- #R.eval "mae <- mean(abs(error), na.rm=T)"
- #R.eval "r <- cor(-log(prediction),-log(measurement))"
- R.eval "svg(filename='#{tmpfile}')"
R.eval "all = c(-log(measurement),-log(prediction))"
R.eval "range = c(min(all), max(all))"
R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
- R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)"
- R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
- R.eval "grid.arrange(image, text, ncol=2)"
- R.eval "dev.off()"
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
plot_id = $gridfs.insert_one(file)
update(:correlation_plot_id => plot_id)
end
- p correlation_plot_id
$gridfs.find_one(_id: correlation_plot_id).data
end
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 89b50f7..f801062 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -21,6 +21,9 @@ $gridfs = $mongo.database.fs
# R setup
R = Rserve::Connection.new
+R.eval "library(ggplot2)"
+R.eval "library(grid)"
+R.eval "library(gridExtra)"
# Logger setup
STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
diff --git a/lib/model.rb b/lib/model.rb
index cd88e0c..98433d0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -48,7 +48,7 @@ module OpenTox
self
end
- def predict object
+ def predict object, use_database_values=true
t = Time.now
at = Time.now
@@ -75,7 +75,7 @@ module OpenTox
compounds.each_with_index do |compound,c|
t = Time.new
database_activities = training_dataset.values(compound,prediction_feature)
- if database_activities and !database_activities.empty?
+ if use_database_values and database_activities and !database_activities.empty?
database_activities = database_activities.first if database_activities.size == 1
predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
next
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index be90c56..c92ad2b 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -96,6 +96,16 @@ class Array
self.inject{ |sum, el| sum + el }.to_f / self.size
end
+ def sample_variance
+ m = self.mean
+ sum = self.inject(0){|accum, i| accum +(i-m)**2 }
+ sum/(self.length - 1).to_f
+ end
+
+ def standard_deviation
+ Math.sqrt(self.sample_variance)
+ end
+
end
module URI
diff --git a/lib/regression.rb b/lib/regression.rb
index 9062a9e..868c25f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -24,16 +24,24 @@ module OpenTox
sim_sum = 0.0
confidence = 0.0
neighbors = params[:neighbors]
+ activities = []
neighbors.each do |row|
n,sim,acts = row
confidence = sim if sim > confidence # distance to nearest neighbor
# TODO add LOO errors
acts.each do |act|
weighted_sum += sim*Math.log10(act)
+ activities << act
sim_sum += sim
end
end
+ #R.assign "activities", activities
+ #R.eval "cv = cv(activities)"
+ #confidence /= activities.standard_deviation#/activities.mean
#confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size]
+ #confidence = sim_sum/neighbors.size.to_f
+ #confidence = neighbors.size.to_f
+ confidence = 0 if confidence.nan?
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
{:value => prediction,:confidence => confidence}
end
diff --git a/lib/validation.rb b/lib/validation.rb
index 9eebef8..c52ffc0 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -39,7 +39,7 @@ module OpenTox
activity = activities[i]
prediction = de.first
confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+ predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]]
else
nr_unpredicted += 1
end
@@ -50,7 +50,7 @@ module OpenTox
:test_dataset_id => test_set.id,
:nr_instances => test_set.compound_ids.size,
:nr_unpredicted => nr_unpredicted,
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+ :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
)
validation.crossvalidation_id = crossvalidation.id if crossvalidation
validation.save
diff --git a/test/compound.rb b/test/compound.rb
index 036f384..24356d3 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -160,4 +160,17 @@ print c.sdf
end
end
end
+
+ def test_fingerprint_db_neighbors
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+ [
+ "CC(=O)CC(C)C#N",
+ "CC(=O)CC(C)C",
+ "C(=O)CC(C)C#N",
+ ].each do |smi|
+ c = OpenTox::Compound.from_smiles smi
+ neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
+ p neighbors
+ end
+ end
end
diff --git a/test/setup.rb b/test/setup.rb
index 3dad683..ba1b7af 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -3,5 +3,7 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
+Mongoid.configure.connect_to("test")
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/test')
#$mongo.database.drop
-#$gridfs = $mongo.database.fs # recreate GridFS indexes
+$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index af5ea60..6764a32 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -16,11 +16,35 @@ class ValidationTest < MiniTest::Test
model = Model::LazarClassification.create dataset#, features
cv = ClassificationCrossValidation.create model
assert cv.accuracy > 0.7
+ File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+ `inkview tmp.svg`
p cv.nr_unpredicted
p cv.accuracy
#assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
end
+ def test_default_regression_crossvalidation
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ model = Model::LazarRegression.create dataset
+ cv = RegressionCrossValidation.create model
+ #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
+ p cv.id
+ File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
+ `inkview tmp.svg`
+ File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+ `inkview tmp.svg`
+
+ #puts cv.misclassifications.to_yaml
+ p cv.rmse
+ p cv.weighted_rmse
+ assert cv.rmse < 1.5, "RMSE > 1.5"
+ #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
+ p cv.mae
+ p cv.weighted_mae
+ assert cv.mae < 1
+ #assert cv.weighted_mae < cv.mae
+ end
+
def test_regression_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
#dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
@@ -41,13 +65,8 @@ class ValidationTest < MiniTest::Test
refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
end
- #`inkview #{cv.plot}`
- #puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n"
- #`inkview #{cv.plot}`
- assert cv.rmse < 30, "RMSE > 30"
- #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
- assert cv.mae < 12
- #assert cv.weighted_mae < cv.mae
+ assert cv.rmse < 1.5, "RMSE > 1.5"
+ assert cv.mae < 1
end
def test_repeated_crossvalidation
@@ -55,7 +74,8 @@ class ValidationTest < MiniTest::Test
model = Model::LazarClassification.create dataset
repeated_cv = RepeatedCrossValidation.create model
repeated_cv.crossvalidations.each do |cv|
- assert cv.accuracy > 0.7
+ assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
+ assert_operator cv.weighted_accuracy, :>, cv.accuracy
end
end