summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-05-31 18:08:08 +0200
committer Christoph Helma <helma@in-silico.ch> 2016-05-31 18:08:08 +0200
commit b515a0cfedb887a2af753db6e4a08ae1af430cad (patch)
tree 5d69d89d0031d581e932272aeb741ee38a0106d6
parent f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 (diff)
cleanup of validation modules/classes
-rw-r--r-- lib/classification.rb 2
-rw-r--r-- lib/compound.rb 6
-rw-r--r-- lib/crossvalidation.rb 251
-rw-r--r-- lib/dataset.rb 2
-rw-r--r-- lib/lazar.rb 7
-rw-r--r-- lib/leave-one-out-validation.rb 141
-rw-r--r-- lib/model.rb 26
-rw-r--r-- lib/nanoparticle.rb 80
-rw-r--r-- lib/regression.rb 6
-rw-r--r-- lib/train-test-validation.rb 58
-rw-r--r-- lib/validation-statistics.rb 292
-rw-r--r-- lib/validation.rb 72
-rw-r--r-- test/classification.rb 2
-rw-r--r-- test/nanoparticles.rb 70
-rw-r--r-- test/setup.rb 4
-rw-r--r-- test/validation.rb 5
16 files changed, 509 insertions, 515 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index 48ff8b3..0f3c6d9 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -7,7 +7,7 @@ module OpenTox
sims = {}
neighbors.each do |neighbor|
sim = neighbor["similarity"]
- activities = neighbor["toxicities"]
+ activities = neighbor["measurements"]
activities.each do |act|
sims[act] ||= []
sims[act] << sim
diff --git a/lib/compound.rb b/lib/compound.rb
index a87678e..4541816 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -260,7 +260,7 @@ module OpenTox
if type == DEFAULT_FINGERPRINT
neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id)
neighbors.each do |n|
- n["toxicities"] = dataset.values(n["_id"],prediction_feature_id)
+ n["measurements"] = dataset.values(n["_id"],prediction_feature_id)
end
else
query_fingerprint = self.fingerprint type
@@ -269,7 +269,7 @@ module OpenTox
if values
candidate_fingerprint = compound.fingerprint type
sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint)
- neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
+ neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim
end
end
end
@@ -310,7 +310,7 @@ module OpenTox
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1,
- #'toxicities' => 1,
+ #'measurements' => 1,
'dataset_ids' => 1
}},
{'$match' => {'similarity' => {'$gte' => min_sim}}},
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 420dd8c..22071d8 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -1,193 +1,96 @@
module OpenTox
- class CrossValidation
- field :validation_ids, type: Array, default: []
- field :model_id, type: BSON::ObjectId
- field :folds, type: Integer
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Hash, default: {}
- field :finished_at, type: Time
-
- def time
- finished_at - created_at
- end
-
- def validations
- validation_ids.collect{|vid| Validation.find vid}
- end
-
- def model
- Model::Lazar.find model_id
- end
-
- def self.create model, n=10
- klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
- klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
- bad_request_error "Unknown model class #{model.class}." unless klass
-
- cv = klass.new(
- name: model.name,
- model_id: model.id,
- folds: n
- )
- cv.save # set created_at
- nr_instances = 0
- nr_unpredicted = 0
- predictions = {}
- training_dataset = Dataset.find model.training_dataset_id
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations can lead to Rserve and memory problems
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
- t = Time.now
- validation = Validation.create(model, fold[0], fold[1],cv)
- #p validation
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- #end
- end
- #Process.waitall
- cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
- cv.validations.each do |validation|
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- predictions.merge! validation.predictions
+ module Validation
+ class CrossValidation < Validation
+ field :validation_ids, type: Array, default: []
+ field :model_id, type: BSON::ObjectId
+ field :folds, type: Integer, default: 10
+ field :nr_instances, type: Integer, default: 0
+ field :nr_unpredicted, type: Integer, default: 0
+ field :predictions, type: Hash, default: {}
+
+ def time
+ finished_at - created_at
end
- cv.update_attributes(
- nr_instances: nr_instances,
- nr_unpredicted: nr_unpredicted,
- predictions: predictions
- )
- $logger.debug "Nr unpredicted: #{nr_unpredicted}"
- cv.statistics
- cv
- end
- end
- class ClassificationCrossValidation < CrossValidation
-
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :confidence_plot_id, type: BSON::ObjectId
- # TODO auc, f-measure (usability??)
-
- def statistics
- stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
- update_attributes(stat)
- stat
- end
+ def validations
+ validation_ids.collect{|vid| TrainTest.find vid}
+ end
- def confidence_plot
- unless confidence_plot_id
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
- accuracies = []
- confidences = []
- correct_predictions = 0
- incorrect_predictions = 0
- predictions.each do |p|
- if p[1] and p[2]
- p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
- confidences << p[3]
+ def model
+ Model::Lazar.find model_id
+ end
- end
+ def self.create model, n=10
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+ bad_request_error "Unknown model class #{model.class}." unless klass
+
+ cv = klass.new(
+ name: model.name,
+ model_id: model.id,
+ folds: n
+ )
+ cv.save # set created_at
+ nr_instances = 0
+ nr_unpredicted = 0
+ predictions = {}
+ training_dataset = Dataset.find model.training_dataset_id
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = TrainTest.create(model, fold[0], fold[1])
+ cv.validation_ids << validation.id
+ cv.nr_instances += validation.nr_instances
+ cv.nr_unpredicted += validation.nr_unpredicted
+ cv.predictions.merge! validation.predictions
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+ #end
end
- R.assign "accuracy", accuracies
- R.assign "confidence", confidences
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
+ #Process.waitall
+ cv.save
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+ cv.statistics
+ cv.update_attributes(finished_at: Time.now)
+ cv
end
- $gridfs.find_one(_id: confidence_plot_id).data
- end
-
- #Average area under roc 0.646
- #Area under roc 0.646
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
- end
-
- class RegressionCrossValidation < CrossValidation
-
- field :rmse, type: Float
- field :mae, type: Float
- field :r_squared, type: Float
- field :correlation_plot_id, type: BSON::ObjectId
-
- def statistics
- stat = ValidationStatistics.regression predictions
- update_attributes(stat)
- stat
end
- def misclassifications n=nil
- n ||= 10
- model = Model::Lazar.find(self.model_id)
- training_dataset = Dataset.find(model.training_dataset_id)
- prediction_feature = training_dataset.features.first
- predictions.collect do |p|
- unless p.include? nil
- compound = Compound.find(p[0])
- neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
- neighbors.collect! do |n|
- neighbor = Compound.find(n[0])
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]}
- end
- {
- :smiles => compound.smiles,
- :measured => p[1],
- :predicted => p[2],
- :error => (p[1]-p[2]).abs,
- :relative_error => (p[1]-p[2]).abs/p[1],
- :confidence => p[3],
- :neighbors => neighbors
- }
- end
- end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+ class ClassificationCrossValidation < CrossValidation
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array
+ field :weighted_confusion_matrix, type: Array
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ field :confidence_plot_id, type: BSON::ObjectId
end
- def confidence_plot
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
- sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact
- R.assign "error", sorted_predictions.collect{|p| p[0]}
- R.assign "confidence", sorted_predictions.collect{|p| p[1]}
- # TODO fix axis names
- R.eval "image = qplot(confidence,error)"
- R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
- $gridfs.find_one(_id: confidence_plot_id).data
+ class RegressionCrossValidation < CrossValidation
+ include RegressionStatistics
+ field :rmse, type: Float
+ field :mae, type: Float
+ field :r_squared, type: Float
+ field :correlation_plot_id, type: BSON::ObjectId
end
- def correlation_plot
- unless correlation_plot_id
- plot_id = ValidationStatistics.correlation_plot id, predictions
- update(:correlation_plot_id => plot_id)
+ class RepeatedCrossValidation < Validation
+ field :crossvalidation_ids, type: Array, default: []
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ end
+ repeated_cross_validation.save
+ repeated_cross_validation
end
- $gridfs.find_one(_id: correlation_plot_id).data
- end
- end
-
- class RepeatedCrossValidation
- field :crossvalidation_ids, type: Array, default: []
- def self.create model, folds=10, repeats=3
- repeated_cross_validation = self.new
- repeats.times do |n|
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
end
- repeated_cross_validation.save
- repeated_cross_validation
- end
- def crossvalidations
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
end
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 0c65d61..2e21e5b 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -69,7 +69,7 @@ module OpenTox
training_idxs = indices-test_idxs
training_substances = training_idxs.collect{|i| substances[i]}
chunk = [training_substances,test_substances].collect do |substances|
- dataset = self.class.create(:source => self.id )
+ dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
substances.each do |substance|
substance.dataset_ids << dataset.id
substance.dataset_ids.uniq!
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 7bd87f4..1853aba 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -62,7 +62,7 @@ suppressPackageStartupMessages({
"
# OpenTox classes and includes
-CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -82,8 +82,9 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross
"regression.rb",
"validation-statistics.rb",
"validation.rb",
- "crossvalidation.rb",
+ "train-test-validation.rb",
"leave-one-out-validation.rb",
- "experiment.rb",
+ "crossvalidation.rb",
+ #"experiment.rb",
"import.rb",
].each{ |f| require_relative f }
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 9698e05..7ff65ff 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -1,110 +1,57 @@
module OpenTox
- class LeaveOneOutValidation
-
- field :model_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Hash
- field :finished_at, type: Time
-
- def self.create model
- $logger.debug "#{model.name}: LOO validation started"
- t = Time.now
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
- loo = klass.new :model_id => model.id
- predictions = model.predict model.training_dataset.substances
- predictions.each{|cid,p| p.delete(:neighbors)}
- nr_unpredicted = 0
- predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measured] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
+ module Validation
+
+ class LeaveOneOut < Validation
+
+ def self.create model
+ $logger.debug "#{model.name}: LOO validation started"
+ t = Time.now
+ model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
+ loo = klass.new :model_id => model.id
+ predictions = model.predict model.training_dataset.substances
+ predictions.each{|cid,p| p.delete(:neighbors)}
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
+ else
+ nr_unpredicted += 1
+ end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
end
- predictions.delete(cid) unless prediction[:value] and prediction[:measured]
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ loo.nr_instances = predictions.size
+ loo.nr_unpredicted = nr_unpredicted
+ loo.predictions = predictions
+ loo.statistics
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
+ loo
end
- predictions.select!{|cid,p| p[:value] and p[:measured]}
- loo.nr_instances = predictions.size
- loo.nr_unpredicted = nr_unpredicted
- loo.predictions = predictions
- loo.statistics
- loo.save
- $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
- loo
- end
- def model
- Model::Lazar.find model_id
end
- end
- class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
-
- field :accept_values, type: Array
- field :confusion_matrix, type: Array, default: []
- field :weighted_confusion_matrix, type: Array, default: []
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash, default: {}
- field :predictivity, type: Hash, default: {}
- field :confidence_plot_id, type: BSON::ObjectId
-
- def statistics
- stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
- update_attributes(stat)
+ class ClassificationLeaveOneOut < LeaveOneOut
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array, default: []
+ field :weighted_confusion_matrix, type: Array, default: []
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash, default: {}
+ field :predictivity, type: Hash, default: {}
+ field :confidence_plot_id, type: BSON::ObjectId
end
-
- def confidence_plot
- unless confidence_plot_id
- tmpfile = "/tmp/#{id.to_s}_confidence.svg"
- accuracies = []
- confidences = []
- correct_predictions = 0
- incorrect_predictions = 0
- predictions.each do |p|
- p[:database_activities].each do |db_act|
- if p[:value]
- p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
- confidences << p[:confidence]
-
- end
- end
- end
- R.assign "accuracy", accuracies
- R.assign "confidence", confidences
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
- end
- $gridfs.find_one(_id: confidence_plot_id).data
+
+ class RegressionLeaveOneOut < LeaveOneOut
+ include RegressionStatistics
+ field :rmse, type: Float, default: 0
+ field :mae, type: Float, default: 0
+ field :r_squared, type: Float
+ field :correlation_plot_id, type: BSON::ObjectId
+ field :confidence_plot_id, type: BSON::ObjectId
end
- end
-
-
- class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
- field :rmse, type: Float, default: 0
- field :mae, type: Float, default: 0
- field :r_squared, type: Float
- field :correlation_plot_id, type: BSON::ObjectId
- field :confidence_plot_id, type: BSON::ObjectId
- def statistics
- stat = ValidationStatistics.regression predictions
- update_attributes(stat)
- end
-
- def correlation_plot
- unless correlation_plot_id
- plot_id = ValidationStatistics.correlation_plot id, predictions
- update(:correlation_plot_id => plot_id)
- end
- $gridfs.find_one(_id: correlation_plot_id).data
- end
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 18d621b..988cac9 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -22,7 +22,6 @@ module OpenTox
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
def initialize prediction_feature, training_dataset, params={}
-
super params
# set defaults for empty parameters
@@ -39,15 +38,15 @@ module OpenTox
def correlation_filter
self.relevant_features = {}
- toxicities = []
+ measurements = []
substances = []
training_dataset.substances.each do |s|
training_dataset.values(s,prediction_feature_id).each do |act|
- toxicities << act
+ measurements << act
substances << s
end
end
- R.assign "tox", toxicities
+ R.assign "tox", measurements
feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
feature_ids.each do |feature_id|
feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]}
@@ -62,7 +61,7 @@ module OpenTox
self.relevant_features[feature_id]["r"] = r
end
rescue
- warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
+ warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
end
end
self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
@@ -71,22 +70,22 @@ module OpenTox
def predict_substance substance
neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols
neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters)
- database_activities = nil
+ measurements = nil
prediction = {}
# handle query substance
if neighbors.collect{|n| n["_id"]}.include? substance.id
query = neighbors.select{|n| n["_id"] == substance.id}.first
- database_activities = training_dataset.values(query["_id"],prediction_feature_id)
- prediction[:database_activities] = database_activities
- prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance."
+ measurements = training_dataset.values(query["_id"],prediction_feature_id)
+ prediction[:measurements] = measurements
+ prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance."
neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation)
end
if neighbors.empty?
prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
elsif neighbors.size == 1
value = nil
- tox = neighbors.first["toxicities"]
+ tox = neighbors.first["measurements"]
if tox.size == 1 # single measurement
value = tox.first
else # multiple measurement
@@ -141,7 +140,7 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
- predictions.each{|cid,p| p.delete(:neighbors)}
+ #predictions.each{|cid,p| p.delete(:neighbors)}
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
@@ -187,6 +186,7 @@ module OpenTox
model.save
model
end
+
end
class LazarRegression < Lazar
@@ -197,19 +197,21 @@ module OpenTox
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
model.neighbor_algorithm_parameters ||= {}
{
- :type => "MP2D",
:min_sim => 0.1,
:dataset_id => training_dataset.id,
:prediction_feature_id => prediction_feature.id,
}.each do |key,value|
model.neighbor_algorithm_parameters[key] ||= value
end
+ model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? Compound
model.save
model
end
+
end
class Prediction
+
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 5c6d944..d0f8f51 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -6,58 +6,43 @@ module OpenTox
field :core, type: Hash, default: {}
field :coating, type: Array, default: []
field :proteomics, type: Hash, default: {}
-
- def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id:
- dataset = Dataset.find(dataset_id)
- neighbors = []
- dataset.nanoparticles.each do |np|
- values = dataset.values(np,prediction_feature_id)
- if values
- common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys
- common_descriptors.select!{|id| NumericFeature.find(id) }
- query_descriptors = common_descriptors.collect{|d| physchem_descriptors[d].first}
- neighbor_descriptors = common_descriptors.collect{|d| np.physchem_descriptors[d].first}
- sim = Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)
- neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
- end
- end
- neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
- neighbors
- end
- def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id:
+ def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:
p self.name
- #p self.physchem_descriptors.keys.size
dataset = Dataset.find(dataset_id)
relevant_features = {}
- toxicities = []
+ measurements = []
substances = []
# TODO: exclude query activities!!!
dataset.substances.each do |s|
- dataset.values(s,prediction_feature_id).each do |act|
- toxicities << act
- substances << s
+ if s.core == self.core # exclude nanoparticles with different core
+ dataset.values(s,prediction_feature_id).each do |act|
+ measurements << act
+ substances << s
+ end
end
end
- R.assign "tox", toxicities
+ R.assign "tox", measurements
feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature}
# identify relevant features
feature_ids.each do |feature_id|
feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]}
- R.assign "feature", feature_values
- begin
- R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
- pvalue = R.eval("cor$p.value").to_ruby
- if pvalue <= 0.05
- r = R.eval("cor$estimate").to_ruby
- relevant_features[feature_id] = {}
- relevant_features[feature_id]["pvalue"] = pvalue
- relevant_features[feature_id]["r"] = r
- relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
- relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
+ unless feature_values.uniq.size == 1
+ R.assign "feature", feature_values
+ begin
+ R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
+ p_value = R.eval("cor$p.value").to_ruby
+ if p_value <= 0.05
+ r = R.eval("cor$estimate").to_ruby
+ relevant_features[feature_id] = {}
+ relevant_features[feature_id]["p_value"] = p_value
+ relevant_features[feature_id]["r"] = r
+ relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
+ relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
+ end
+ rescue
+ warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
end
- rescue
- warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
end
end
neighbors = []
@@ -68,13 +53,17 @@ module OpenTox
# scale values
query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
- #weights = common_descriptors.collect{|d| 1-relevant_features[d]["pvalue"]}
+ #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]}
weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2}
- #p weights
sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights)
- ##p "SIM"
- #p [sim, Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)]
- neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
+ neighbors << {
+ "_id" => substance.id,
+ "measurements" => values,
+ "similarity" => sim,
+ "common_descriptors" => common_descriptors.collect do |id|
+ {:id => id, :p_value => relevant_features[id]["p_value"], :r_squared => relevant_features[id]["r"]**2}
+ end
+ } if sim >= min_sim
end
end
p neighbors.size
@@ -94,10 +83,7 @@ module OpenTox
proteomics[feature.id.to_s] << value
proteomics[feature.id.to_s].uniq!
when "TOX"
- # TODO generic way of parsing TOX values
- if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)"
- dataset.add self, feature, Math.log2(value)
- elsif feature.name == "Total protein (BCA assay)"
+ if feature.name == "Total protein (BCA assay)"
physchem_descriptors[feature.id.to_s] ||= []
physchem_descriptors[feature.id.to_s] << value
physchem_descriptors[feature.id.to_s].uniq!
diff --git a/lib/regression.rb b/lib/regression.rb
index 6487557..cffcbbf 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -8,7 +8,7 @@ module OpenTox
sim_sum = 0.0
neighbors.each do |neighbor|
sim = neighbor["similarity"]
- activities = neighbor["toxicities"]
+ activities = neighbor["measurements"]
activities.each do |act|
weighted_sum += sim*act
sim_sum += sim
@@ -26,7 +26,7 @@ module OpenTox
neighbors.each do |n|
fingerprint = Substance.find(n["_id"]).fingerprint
- activities = n["toxicities"]
+ activities = n["measurements"]
activities.each do |act|
values << act
weights << n["similarity"]
@@ -79,7 +79,7 @@ module OpenTox
neighbors.each_with_index do |n,i|
neighbor = Substance.find(n["_id"])
- activities = neighbor["toxicities"]
+ activities = neighbor["measurements"]
activities.each do |act|
data_frame[0][i] = act
weights << n["similarity"]
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
new file mode 100644
index 0000000..286614a
--- /dev/null
+++ b/lib/train-test-validation.rb
@@ -0,0 +1,58 @@
+module OpenTox
+
+ module Validation
+
+ class TrainTest < Validation
+
+ field :training_dataset_id, type: BSON::ObjectId
+ field :test_dataset_id, type: BSON::ObjectId
+
+ def self.create model, training_set, test_set
+
+ atts = model.attributes.dup # do not modify attributes of the original model
+ atts["_id"] = BSON::ObjectId.new
+ atts[:training_dataset_id] = training_set.id
+ validation_model = model.class.create model.prediction_feature, training_set, atts
+ validation_model.save
+ predictions = validation_model.predict test_set.substances
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
+ else
+ nr_unpredicted += 1
+ end
+ end
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ validation = self.new(
+ :model_id => validation_model.id,
+ :test_dataset_id => test_set.id,
+ :nr_instances => test_set.substances.size,
+ :nr_unpredicted => nr_unpredicted,
+ :predictions => predictions
+ )
+ validation.save
+ validation
+ end
+
+ def test_dataset
+ Dataset.find test_dataset_id
+ end
+
+ def training_dataset
+ Dataset.find training_dataset_id
+ end
+
+ end
+
+ class ClassificationTrainTest < TrainTest
+ include ClassificationStatistics
+ end
+
+ class RegressionTrainTest < TrainTest
+ include RegressionStatistics
+ end
+
+ end
+
+end
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index e61543b..816824b 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -1,123 +1,203 @@
module OpenTox
- class ValidationStatistics
- include OpenTox
- def self.classification predictions, accept_values
- confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
- nr_instances = 0
- predictions.each do |cid,pred|
- # TODO
- # use predictions without probabilities (single neighbor)??
- # use measured majority class??
- if pred[:measured].uniq.size == 1 and pred[:probabilities]
- m = pred[:measured].first
- if pred[:value] == m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- end
- elsif pred[:value] != m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ module Validation
+ module ClassificationStatistics
+
+ def statistics
+ self.accept_values = model.prediction_feature.accept_values
+ self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ true_rate = {}
+ predictivity = {}
+ nr_instances = 0
+ predictions.each do |cid,pred|
+ # TODO
+ # use predictions without probabilities (single neighbor)??
+ # use measured majority class??
+ if pred[:measurements].uniq.size == 1 and pred[:probabilities]
+ m = pred[:measurements].first
+ if pred[:value] == m
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][0] += 1
+ weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][1] += 1
+ weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
+ elsif pred[:value] != m
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][1] += 1
+ weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][0] += 1
+ weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
end
end
end
+ true_rate = {}
+ predictivity = {}
+ accept_values.each_with_index do |v,i|
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ end
+ confidence_sum = 0
+ weighted_confusion_matrix.each do |r|
+ r.each do |c|
+ confidence_sum += c
+ end
+ end
+ self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+ self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ $logger.debug "Accuracy #{accuracy}"
+ save
+ {
+ :accept_values => accept_values,
+ :confusion_matrix => confusion_matrix,
+ :weighted_confusion_matrix => weighted_confusion_matrix,
+ :accuracy => accuracy,
+ :weighted_accuracy => weighted_accuracy,
+ :true_rate => true_rate,
+ :predictivity => predictivity,
+ }
end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+
+      # Return the accumulated-accuracy vs. confidence plot, creating it first
+      # if no confidence_plot_id is stored yet.
+      #
+      # On creation the plot is drawn in R, written to a temporary file,
+      # inserted into GridFS and its id is saved as confidence_plot_id.
+      # @return the data of the GridFS file referenced by confidence_plot_id
+      def confidence_plot
+        unless confidence_plot_id
+          tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+          accuracies = []
+          confidences = []
+          correct_predictions = 0
+          incorrect_predictions = 0
+          # predictions is a Hash (id => prediction): destructure the pair, a single
+          # block parameter would receive the [id, prediction] Array instead
+          predictions.each do |_sid,pred|
+            next unless pred[:value]
+            pred[:measurements].each do |db_act|
+              pred[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
+              accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+              confidences << pred[:confidence]
+            end
+          end
+          R.assign "accuracy", accuracies
+          R.assign "confidence", confidences
+          R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+          plot_id = $gridfs.insert_one(file)
+          update(:confidence_plot_id => plot_id)
+        end
+        $gridfs.find_one(_id: confidence_plot_id).data
+      end
- accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
- weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
- $logger.debug "Accuracy #{accuracy}"
- {
- :accept_values => accept_values,
- :confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
- :accuracy => accuracy,
- :weighted_accuracy => weighted_accuracy,
- :true_rate => true_rate,
- :predictivity => predictivity,
- :finished_at => Time.now
- }
end
- def self.regression predictions
- # TODO: predictions within prediction_interval
- rmse = 0
- mae = 0
- x = []
- y = []
- predictions.each do |cid,pred|
- if pred[:value] and pred[:measured]
- x << pred[:measured].median
- y << pred[:value]
- error = pred[:value]-pred[:measured].median
- rmse += error**2
- mae += error.abs
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ module RegressionStatistics
+
+      # Compute regression statistics for the stored predictions.
+      #
+      # The median of the measurements is compared with the predicted value;
+      # R^2 is obtained from R's cor().
+      # @return [Hash] with the keys :mae, :rmse and :r_squared
+      def statistics
+        # TODO: predictions within prediction_interval
+        rmse = 0
+        mae = 0
+        x = []
+        y = []
+        predictions.each do |cid,pred|
+          if pred[:value] and pred[:measurements]
+            measured = pred[:measurements].median
+            x << measured
+            y << pred[:value]
+            error = pred[:value]-measured
+            rmse += error**2
+            mae += error.abs
+          else
+            # use the block variable cid: the previously referenced compound_id
+            # is not defined in this method and raised a NameError
+            message = "No training activities for #{Compound.find(cid).smiles} in training dataset #{model.training_dataset_id}."
+            warnings << message
+            $logger.debug message
+          end
+        end
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "r <- cor(measurement,prediction,use='pairwise')"
+        r = R.eval("r").to_ruby
+
+        # NOTE(review): both errors are averaged over all predictions, including
+        # entries that were skipped above - confirm that this is intended
+        mae = mae/predictions.size
+        rmse = Math.sqrt(rmse/predictions.size)
+        $logger.debug "R^2 #{r**2}"
+        $logger.debug "RMSE #{rmse}"
+        $logger.debug "MAE #{mae}"
+        {
+          :mae => mae,
+          :rmse => rmse,
+          :r_squared => r**2,
+        }
+      end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='pairwise')"
- r = R.eval("r").to_ruby
- mae = mae/predictions.size
- rmse = Math.sqrt(rmse/predictions.size)
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
- {
- :mae => mae,
- :rmse => rmse,
- :r_squared => r**2,
- :finished_at => Time.now
- }
- end
+      # Return the prediction vs. measurement scatter plot, creating it first
+      # if no correlation_plot_id is stored yet.
+      #
+      # On creation the plot is drawn in R (titled with the prediction feature
+      # name and, if present, its unit), written to a temporary PDF file,
+      # inserted into GridFS and its id is saved as correlation_plot_id.
+      # @return the data of the GridFS file referenced by correlation_plot_id
+      def correlation_plot
+        unless correlation_plot_id
+          tmpfile = "/tmp/#{id.to_s}_correlation.pdf"
+          measured = []
+          predicted = []
+          feature = Feature.find(predictions.first.last["prediction_feature_id"])
+          predictions.each do |sid,p|
+            measured << p["measurements"].median
+            predicted << p["value"]
+          end
+          # measured medians go to "measurement" and predicted values to "prediction",
+          # so that the data match the axis labels below
+          R.assign "measurement", measured
+          R.assign "prediction", predicted
+          R.eval "all = c(measurement,prediction)"
+          R.eval "range = c(min(all), max(all))"
+          title = feature.name
+          title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+          R.eval "image = image + geom_abline(intercept=0, slope=1)"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          # the stored filename carries the same extension as the file written by ggsave
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.pdf")
+          plot_id = $gridfs.insert_one(file)
+          update(:correlation_plot_id => plot_id)
+        end
+        $gridfs.find_one(_id: correlation_plot_id).data
+      end
- def self.correlation_plot id, predictions
- tmpfile = "/tmp/#{id.to_s}_correlation.png"
- x = []
- y = []
- predictions.each do |sid,p|
- x << p["value"]
- y << p["measured"].median
+      # Report the predictions with the largest absolute difference between
+      # the predicted value and the median of the measurements.
+      # @param n number of predictions to report
+      # @param show_neigbors list each neighbor (true) or only the neighbor count (false);
+      #   the spelling is part of the keyword interface
+      # @param show_common_descriptors list the common descriptors of each neighbor (true)
+      #   or only their count (false)
+      # @return [Array<Hash>] one Hash per prediction with the keys :id, :name, :feature,
+      #   :error, :prediction, :measurements and :neighbors
+      def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
+        ranked = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}
+        ranked[0,n].collect do |sid,prediction|
+          substance = Substance.find(sid)
+          neighbors = if show_neigbors
+            prediction["neighbors"].collect do |neighbor|
+              descriptors = neighbor["common_descriptors"]
+              common_descriptors = if show_common_descriptors
+                descriptors.collect do |d|
+                  feature = Feature.find(d)
+                  {
+                    :id => feature.id.to_s,
+                    :name => "#{feature.name} (#{feature.conditions})",
+                    :p_value => d[:p_value],
+                    :r_squared => d[:r_squared],
+                  }
+                end
+              else
+                descriptors.size
+              end
+              {
+                :name => Substance.find(neighbor["_id"]).name,
+                :id => neighbor["_id"].to_s,
+                :common_descriptors => common_descriptors
+              }
+            end
+          else
+            prediction["neighbors"].size
+          end
+          {
+            :id => substance.id.to_s,
+            :name => substance.name,
+            :feature => Feature.find(prediction["prediction_feature_id"]).name,
+            :error => (prediction["value"] - prediction["measurements"].median).abs,
+            :prediction => prediction["value"],
+            :measurements => prediction["measurements"],
+            :neighbors => neighbors
+          }
+        end
+      end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "all = c(measurement,prediction)"
- R.eval "range = c(min(all), max(all))"
- # TODO units
- R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
- plot_id = $gridfs.insert_one(file)
- plot_id
end
end
end
diff --git a/lib/validation.rb b/lib/validation.rb
index 9122df1..ff9a971 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,63 +1,25 @@
module OpenTox
- class Validation
-
- field :model_id, type: BSON::ObjectId
- field :prediction_dataset_id, type: BSON::ObjectId
- field :crossvalidation_id, type: BSON::ObjectId
- field :test_dataset_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Hash
-
- def prediction_dataset
- Dataset.find prediction_dataset_id
- end
-
- def test_dataset
- Dataset.find test_dataset_id
- end
-
- def model
- Model::Lazar.find model_id
- end
-
- def self.create model, training_set, test_set, crossvalidation=nil
-
- atts = model.attributes.dup # do not modify attributes of the original model
- atts["_id"] = BSON::ObjectId.new
- atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create model.prediction_feature, training_set, atts
- validation_model.save
- predictions = validation_model.predict test_set.substances
- predictions.each{|cid,p| p.delete(:neighbors)}
- nr_unpredicted = 0
- predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measured] = test_set.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
- end
+ module Validation
+
+ class Validation
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "validations"
+ field :name, type: String
+ field :model_id, type: BSON::ObjectId
+ field :nr_instances, type: Integer
+ field :nr_unpredicted, type: Integer
+ field :predictions, type: Hash
+ field :finished_at, type: Time
+
+ def model
+ Model::Lazar.find model_id
end
- predictions.select!{|cid,p| p[:value] and p[:measured]}
- validation = self.new(
- :model_id => validation_model.id,
- :test_dataset_id => test_set.id,
- :nr_instances => test_set.substances.size,
- :nr_unpredicted => nr_unpredicted,
- :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
- )
- validation.crossvalidation_id = crossvalidation.id if crossvalidation
- validation.save
- validation
- end
-
- end
- class ClassificationValidation < Validation
- end
+ end
- class RegressionValidation < Validation
end
end
diff --git a/test/classification.rb b/test/classification.rb
index df7cba9..9104022 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -20,7 +20,7 @@ class LazarClassificationTest < MiniTest::Test
compound = Compound.from_smiles "CCO"
prediction = model.predict compound
assert_equal "true", prediction[:value]
- assert_equal ["false"], prediction[:database_activities]
+ assert_equal ["false"], prediction[:measurements]
# make a dataset prediction
compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 1cd1ff0..f0ded2f 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -11,7 +11,7 @@ class NanoparticleTest < MiniTest::Test
def test_create_model_with_feature_selection
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors", :feature_selection_algorithm => "correlation_filter"})
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :feature_selection_algorithm => "correlation_filter"})
nanoparticle = training_dataset.nanoparticles[-34]
#p nanoparticle.neighbors
prediction = model.predict nanoparticle
@@ -23,7 +23,7 @@ class NanoparticleTest < MiniTest::Test
def test_create_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors"})
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors"})
nanoparticle = training_dataset.nanoparticles[-34]
prediction = model.predict nanoparticle
refute_nil prediction[:value]
@@ -31,13 +31,67 @@ class NanoparticleTest < MiniTest::Test
model.delete
end
+ # TODO move to validation-statistics
+ def test_inspect_cv # inspection helper without assertions: writes a correlation plot to tmp.pdf
+ cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last # most recently created cross-validation
+ cv.correlation_plot_id = nil # correlation_plot only renders when no plot id is set
+ File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
+ #p cv
+=begin
+ #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
+ cv.predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,5].each do |sid,p|
+ s = Substance.find(sid)
+ puts
+ p s.name
+ p([p["value"],p["measurements"],(p["value"]-p["measured"].median).abs])
+ neighbors = s.physchem_neighbors dataset_id: cv.model.training_dataset_id, prediction_feature_id: cv.model.prediction_feature_id, type: nil
+ neighbors.each do |n|
+ neighbor = Substance.find(n["_id"])
+ p "=="
+ p neighbor.name, n["similarity"], n["measurements"]
+ p neighbor.core["name"]
+ p neighbor.coating.collect{|c| c["name"]}
+ n["common_descriptors"].each do |id|
+ f = Feature.find(id)
+ print "#{f.name} #{f.conditions["MEDIUM"]}"
+ print ", "
+ end
+ puts
+ end
+
+ end
+=end
+ end
+ def test_inspect_worst_prediction # checks the result shapes of worst_predictions for its keyword options
+# TODO check/fix single/double neighbor prediction
+ cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last # NOTE(review): no cross-validation is created here, one must already be stored - confirm test order
+ worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false) # neighbors reported as a count
+ assert_equal 3, worst_predictions.size
+ assert_kind_of Integer, worst_predictions.first[:neighbors]
+ worst_predictions = cv.worst_predictions # defaults: 5 predictions, neighbor list, descriptor counts
+ #puts worst_predictions.to_yaml
+ assert_equal 5, worst_predictions.size
+ assert_kind_of Array, worst_predictions.first[:neighbors]
+ assert_kind_of Integer, worst_predictions.first[:neighbors].first[:common_descriptors]
+ worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true) # descriptors listed instead of counted
+ puts worst_predictions.to_yaml
+ assert_equal 2, worst_predictions.size
+ assert_kind_of Array, worst_predictions.first[:neighbors]
+ refute_nil worst_predictions.first[:neighbors].first[:common_descriptors]
+ #p cv.model.training_dataset.features
+ end
+
def test_validate_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
- feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
+ #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
+ feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX")
+
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
cv = RegressionCrossValidation.create model
- p cv
- File.open("tmp.png","w+"){|f| f.puts cv.correlation_plot}
+ p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs}
+ p cv.rmse
+ p cv.r_squared
+ File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
refute_nil cv.r_squared
refute_nil cv.rmse
end
@@ -45,7 +99,7 @@ class NanoparticleTest < MiniTest::Test
def test_validate_pls_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors"})
cv = RegressionCrossValidation.create model
p cv
File.open("tmp.png","w+"){|f| f.puts cv.correlation_plot}
@@ -79,7 +133,7 @@ class NanoparticleTest < MiniTest::Test
toxcounts = {}
pccounts = {}
Nanoparticle.all.each do |np|
- np.toxicities.each do |t,v|
+ np.measurements.each do |t,v|
toxcounts[t] ||= 0
toxcounts[t] += 1#v.uniq.size
end
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..e7c32b4 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
+$mongo.database.drop
+$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index 39314da..a259472 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -1,6 +1,7 @@
require_relative "setup.rb"
class ValidationTest < MiniTest::Test
+ include OpenTox::Validation
# defaults
@@ -86,7 +87,7 @@ class ValidationTest < MiniTest::Test
def test_classification_loo_validation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
model = Model::LazarClassification.create dataset.features.first, dataset
- loo = ClassificationLeaveOneOutValidation.create model
+ loo = ClassificationLeaveOneOut.create model
assert_equal 14, loo.nr_unpredicted
refute_empty loo.confusion_matrix
assert loo.accuracy > 0.77
@@ -96,7 +97,7 @@ class ValidationTest < MiniTest::Test
def test_regression_loo_validation
dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
model = Model::LazarRegression.create dataset.features.first, dataset
- loo = RegressionLeaveOneOutValidation.create model
+ loo = RegressionLeaveOneOut.create model
assert loo.r_squared > 0.34, "R^2 (#{loo.r_squared}) should be larger than 0.034"
end