summaryrefslogtreecommitdiff
path: root/lib/crossvalidation.rb
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-08-26 14:20:23 +0200
committerChristoph Helma <helma@in-silico.ch>2015-08-26 14:20:23 +0200
commitd542e9fe92567c54423f39904111bd5293236416 (patch)
tree68d04fe73e7012a2732a15703b25f5934c7e7dad /lib/crossvalidation.rb
parentf8faf510b4574df1a00fa61a9f0a1681fc2f4857 (diff)
Parallel Crossvalidations
Diffstat (limited to 'lib/crossvalidation.rb')
-rw-r--r--lib/crossvalidation.rb81
1 files changed, 42 insertions, 39 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 4407aeb..58a9664 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -6,13 +6,16 @@ module OpenTox
field :folds, type: Integer
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Array, default: []
field :finished_at, type: Time
- #belongs_to :prediction
def time
finished_at - created_at
end
+
+ def validations
+ validation_ids.collect{|vid| Validation.find vid}
+ end
end
class ClassificationCrossValidation < CrossValidation
@@ -45,7 +48,7 @@ module OpenTox
t = Time.now
$logger.debug "Fold #{fold_nr}"
validation = validation_class.create(model, fold[0], fold[1])
- validation_ids << validation.id
+ #validation_ids << validation.id
nr_instances += validation.nr_instances
nr_unpredicted += validation.nr_unpredicted
predictions += validation.predictions
@@ -74,7 +77,7 @@ module OpenTox
name: model.name,
model_id: model.id,
folds: n,
- validation_ids: validation_ids,
+ #validation_ids: validation_ids,
nr_instances: nr_instances,
nr_unpredicted: nr_unpredicted,
accept_values: accept_values,
@@ -103,29 +106,33 @@ module OpenTox
field :weighted_rmse, type: Float
field :weighted_mae, type: Float
field :weighted_mae, type: Float
+ field :r_squared, type: Float
field :correlation_plot_id, type: BSON::ObjectId
def self.create model, n=10
cv = self.new
cv.save # set created_at
- validation_ids = []
+ #validation_ids = []
nr_instances = 0
nr_unpredicted = 0
predictions = []
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
fold_nr = 1
training_dataset = Dataset.find model.training_dataset_id
- training_dataset.folds(n).each do |fold|
- t = Time.now
- $logger.debug "Predicting fold #{fold_nr}"
-
- validation = validation_class.create(model, fold[0], fold[1])
- validation_ids << validation.id
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
+ fork do # parallel execution of validations
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = validation_class.create(model, fold[0], fold[1],cv)
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+ end
+ end
+ Process.waitall
+ cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+ cv.validations.each do |validation|
nr_instances += validation.nr_instances
nr_unpredicted += validation.nr_unpredicted
predictions += validation.predictions
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
- fold_nr +=1
end
rmse = 0
weighted_rmse = 0
@@ -135,9 +142,8 @@ module OpenTox
weighted_mae = 0
rae = 0
weighted_rae = 0
- n = 0
confidence_sum = 0
- nil_activities = []
+ #nil_activities = []
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
@@ -146,34 +152,29 @@ module OpenTox
weighted_rmse += confidence*error**2
mae += error.abs
weighted_mae += confidence*error.abs
- n += 1
confidence_sum += confidence
+ cv.predictions << pred
else
# TODO: create warnings
+ cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{training_dataset.id}."
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{training_dataset.id}."
- nil_activities << pred
+ #nil_activities << pred
end
end
- predictions -= nil_activities
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
- R.assign "Measurement", x
- R.assign "Prediction", y
- R.eval "corr <- lm(-log(Measurement) ~ -log(Prediction))"
- s = R.eval "summary <- summary(corr)"
- p R.eval("summary$r.squared").to_ruby
- #p s.to_ruby
- #p s.to_ruby.first
- s.to_ruby.each_with_index do |l,i|
- #p i
- #p l
- end
- mae = mae/n
+ #predictions -= nil_activities
+ x = cv.predictions.collect{|p| p[1]}
+ y = cv.predictions.collect{|p| p[2]}
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "r <- cor(-log(measurement),-log(prediction))"
+ r = R.eval("r").to_ruby
+
+ mae = mae/cv.predictions.size
weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/n)
+ rmse = Math.sqrt(rmse/cv.predictions.size)
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
# TODO check!!
- predictions.sort! do |a,b|
+ cv.predictions.sort! do |a,b|
relative_error_a = (a[1]-a[2]).abs/a[1].to_f
relative_error_a = 1/relative_error_a if relative_error_a < 1
relative_error_b = (b[1]-b[2]).abs/b[1].to_f
@@ -184,14 +185,15 @@ module OpenTox
name: model.name,
model_id: model.id,
folds: n,
- validation_ids: validation_ids,
+ #validation_ids: validation_ids,
nr_instances: nr_instances,
nr_unpredicted: nr_unpredicted,
- predictions: predictions,#.sort{|a,b| [(b[1]-b[2]).abs/b[1].to_f,b[3]] <=> [(a[1]-a[2]).abs/a[1].to_f,a[3]]},
+ #predictions: predictions,#.sort{|a,b| [(b[1]-b[2]).abs/b[1].to_f,b[3]] <=> [(a[1]-a[2]).abs/a[1].to_f,a[3]]},
mae: mae,
rmse: rmse,
weighted_mae: weighted_mae,
- weighted_rmse: weighted_rmse
+ weighted_rmse: weighted_rmse,
+ r_squared: r**2
)
cv.save
cv
@@ -239,19 +241,20 @@ module OpenTox
#R.eval "error <- log(Measurement)-log(Prediction)"
#R.eval "rmse <- sqrt(mean(error^2, na.rm=T))"
#R.eval "mae <- mean(abs(error), na.rm=T)"
- R.eval "r <- cor(-log(prediction),-log(measurement))"
+ #R.eval "r <- cor(-log(prediction),-log(measurement))"
R.eval "svg(filename='#{tmpfile}')"
R.eval "all = c(-log(measurement),-log(prediction))"
R.eval "range = c(min(all), max(all))"
R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)"
- R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ',round(r^2,2),'\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
+ R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
R.eval "grid.arrange(image, text, ncol=2)"
R.eval "dev.off()"
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
plot_id = $gridfs.insert_one(file)
update(:correlation_plot_id => plot_id)
end
+ p correlation_plot_id
$gridfs.find_one(_id: correlation_plot_id).data
end
end