summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-08-09 13:42:18 +0200
committerChristoph Helma <helma@in-silico.ch>2015-08-09 13:42:18 +0200
commitb0be61443018f5058e976302498f8d1a8397e1f4 (patch)
treebd8d60359b250c1f7be7a929a8223114b842ae9f
parent65c7bdd2bc5de1c2f7bf44a4ed93cb80cc7b4b17 (diff)
crossvalidation.rb, neighbor.rb addedmongodb
-rw-r--r--lib/crossvalidation.rb183
-rw-r--r--lib/neighbor.rb25
2 files changed, 208 insertions, 0 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
new file mode 100644
index 0000000..ffb6fcd
--- /dev/null
+++ b/lib/crossvalidation.rb
@@ -0,0 +1,183 @@
+module OpenTox
+
+ class CrossValidation
+ field :validation_ids, type: Array, default: []
+ field :folds, type: Integer
+ field :nr_instances, type: Integer
+ field :nr_unpredicted, type: Integer
+ field :predictions, type: Array
+ end
+
+ class ClassificationCrossValidation < CrossValidation
+
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array
+ field :weighted_confusion_matrix, type: Array
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ # TODO auc, f-measure (usability??)
+
+ def self.create model, n=10
+ validation_ids = []
+ nr_instances = 0
+ nr_unpredicted = 0
+ predictions = []
+ validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+ accept_values = Feature.find(model.prediction_feature_id).accept_values
+ confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+ weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+ true_rate = {}
+ predictivity = {}
+ fold_nr = 1
+ training_dataset = Dataset.find model.training_dataset_id
+ training_dataset.folds(n).each do |fold|
+ t = Time.now
+ $logger.debug "Fold #{fold_nr}"
+ validation = validation_class.create(model, fold[0], fold[1])
+ validation_ids << validation.id
+ nr_instances += validation.nr_instances
+ nr_unpredicted += validation.nr_unpredicted
+ predictions += validation.predictions
+ validation.confusion_matrix.each_with_index do |r,i|
+ r.each_with_index do |c,j|
+ confusion_matrix[i][j] += c
+ weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+ end
+ end
+ $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
+ fold_nr +=1
+ end
+ true_rate = {}
+ predictivity = {}
+ accept_values.each_with_index do |v,i|
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ end
+ confidence_sum = 0
+ weighted_confusion_matrix.each do |r|
+ r.each do |c|
+ confidence_sum += c
+ end
+ end
+ cv = self.new(
+ :nr_instances => nr_instances,
+ :nr_unpredicted => nr_unpredicted,
+ :accept_values => accept_values,
+ :confusion_matrix => confusion_matrix,
+ :weighted_confusion_matrix => weighted_confusion_matrix,
+ :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
+ :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+ :true_rate => true_rate,
+ :predictivity => predictivity,
+ :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+ )
+ cv.save
+ cv
+ end
+
+ #Average area under roc 0.646
+ #Area under roc 0.646
+ #F measure carcinogen: 0.769, noncarcinogen: 0.348
+ end
+
+ class RegressionCrossValidation < Validation
+
+ field :validation_ids, type: Array, default: []
+ field :folds, type: Integer
+ field :rmse, type: Float
+ field :mae, type: Float
+ field :weighted_rmse, type: Float
+ field :weighted_mae, type: Float
+
+ def self.create model, n=10
+ validation_ids = []
+ nr_instances = 0
+ nr_unpredicted = 0
+ predictions = []
+ validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+ fold_nr = 1
+ training_dataset = Dataset.find model.training_dataset_id
+ training_dataset.folds(n).each do |fold|
+ t = Time.now
+ $logger.debug "Predicting fold #{fold_nr}"
+
+ validation = validation_class.create(model, fold[0], fold[1])
+ validation_ids << validation.id
+ nr_instances += validation.nr_instances
+ nr_unpredicted += validation.nr_unpredicted
+ predictions += validation.predictions
+ $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
+ fold_nr +=1
+ end
+ rmse = 0
+ weighted_rmse = 0
+ rse = 0
+ weighted_rse = 0
+ mae = 0
+ weighted_mae = 0
+ rae = 0
+ weighted_rae = 0
+ n = 0
+ confidence_sum = 0
+ predictions.each do |pred|
+ compound_id,activity,prediction,confidence = pred
+ if activity and prediction
+ error = prediction-activity
+ rmse += error**2
+ weighted_rmse += confidence*error**2
+ mae += error.abs
+ weighted_mae += confidence*error.abs
+ n += 1
+ confidence_sum += confidence
+ else
+ # TODO: create warnings
+ p pred
+ end
+ end
+ mae = mae/n
+ weighted_mae = weighted_mae/confidence_sum
+ rmse = Math.sqrt(rmse/n)
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+ cv = self.new(
+ :folds => n,
+ :validation_ids => validation_ids,
+ :nr_instances => nr_instances,
+ :nr_unpredicted => nr_unpredicted,
+ :predictions => predictions.sort{|a,b| b[3] <=> a[3]},
+ :mae => mae,
+ :rmse => rmse,
+ :weighted_mae => weighted_mae,
+ :weighted_rmse => weighted_rmse
+ )
+ cv.save
+ cv
+ end
+
+ def plot
+ # RMSE
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ R.assign "Measurement", x
+ R.assign "Prediction", y
+ R.eval "par(pty='s')" # sets the plot type to be square
+ #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
+ #R.eval "error <- log(Measurement)-log(Prediction)"
+ R.eval "error <- Measurement-Prediction"
+ R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
+ R.eval "mae <- mean( abs(error), na.rm = TRUE)"
+ R.eval "r <- cor(log(Prediction),log(Measurement))"
+ R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
+ R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
+ #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
+ #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
+ R.eval "abline(0,1,col='blue')"
+ #R.eval "abline(fitline,col='red')"
+ R.eval "dev.off()"
+ "/tmp/#{id.to_s}.svg"
+ end
+ end
+
+
+end
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
new file mode 100644
index 0000000..a2c28d4
--- /dev/null
+++ b/lib/neighbor.rb
@@ -0,0 +1,25 @@
+module OpenTox
+ module Algorithm
+ class Neighbor
+
+ def self.fingerprint_similarity compound, params={}
+ compound.neighbors params[:min_sim]
+ end
+
+ def self.fminer_similarity compound, params
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} )
+ neighbors = []
+
+ # find neighbors
+ feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
+ if sim > params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+ end
+ end
+end