From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/classification.rb | 2 +- lib/compound.rb | 6 +- lib/crossvalidation.rb | 251 +++++++++++----------------------- lib/dataset.rb | 2 +- lib/lazar.rb | 7 +- lib/leave-one-out-validation.rb | 141 ++++++------------- lib/model.rb | 26 ++-- lib/nanoparticle.rb | 80 +++++------ lib/regression.rb | 6 +- lib/train-test-validation.rb | 58 ++++++++ lib/validation-statistics.rb | 292 +++++++++++++++++++++++++--------------- lib/validation.rb | 72 +++------- test/classification.rb | 2 +- test/nanoparticles.rb | 70 ++++++++-- test/setup.rb | 4 +- test/validation.rb | 5 +- 16 files changed, 509 insertions(+), 515 deletions(-) create mode 100644 lib/train-test-validation.rb diff --git a/lib/classification.rb b/lib/classification.rb index 48ff8b3..0f3c6d9 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -7,7 +7,7 @@ module OpenTox sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| sims[act] ||= [] sims[act] << sim diff --git a/lib/compound.rb b/lib/compound.rb index a87678e..4541816 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -260,7 +260,7 @@ module OpenTox if type == DEFAULT_FINGERPRINT neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) neighbors.each do |n| - n["toxicities"] = dataset.values(n["_id"],prediction_feature_id) + n["measurements"] = dataset.values(n["_id"],prediction_feature_id) end else query_fingerprint = self.fingerprint type @@ -269,7 +269,7 @@ module OpenTox if values candidate_fingerprint = compound.fingerprint type sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + neighbors << {"_id" => 
compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim end end end @@ -310,7 +310,7 @@ module OpenTox 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - #'toxicities' => 1, + #'measurements' => 1, 'dataset_ids' => 1 }}, {'$match' => {'similarity' => {'$gte' => min_sim}}}, diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 420dd8c..22071d8 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -1,193 +1,96 @@ module OpenTox - class CrossValidation - field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId - field :folds, type: Integer - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash, default: {} - field :finished_at, type: Time - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| Validation.find vid} - end - - def model - Model::Lazar.find model_id - end - - def self.create model, n=10 - klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification - klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - bad_request_error "Unknown model class #{model.class}." 
unless klass - - cv = klass.new( - name: model.name, - model_id: model.id, - folds: n - ) - cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 - predictions = {} - training_dataset = Dataset.find model.training_dataset_id - training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = Validation.create(model, fold[0], fold[1],cv) - #p validation - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end - end - #Process.waitall - cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) - cv.validations.each do |validation| - nr_instances += validation.nr_instances - nr_unpredicted += validation.nr_unpredicted - predictions.merge! validation.predictions + module Validation + class CrossValidation < Validation + field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId + field :folds, type: Integer, default: 10 + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} + + def time + finished_at - created_at end - cv.update_attributes( - nr_instances: nr_instances, - nr_unpredicted: nr_unpredicted, - predictions: predictions - ) - $logger.debug "Nr unpredicted: #{nr_unpredicted}" - cv.statistics - cv - end - end - class ClassificationCrossValidation < CrossValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId - # TODO auc, f-measure (usability??) 
- - def statistics - stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) - update_attributes(stat) - stat - end + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.png" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - if p[1] and p[2] - p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[3] + def model + Model::Lazar.find model_id + end - end + def self.create model, n=10 + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + + cv = klass.new( + name: model.name, + model_id: model.id, + folds: n + ) + cv.save # set created_at + nr_instances = 0 + nr_unpredicted = 0 + predictions = {} + training_dataset = Dataset.find model.training_dataset_id + training_dataset.folds(n).each_with_index do |fold,fold_nr| + #fork do # parallel execution of validations can lead to Rserve and memory problems + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + cv.nr_instances += validation.nr_instances + cv.nr_unpredicted += validation.nr_unpredicted + cv.predictions.merge! 
validation.predictions + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" + #end end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + #Process.waitall + cv.save + $logger.debug "Nr unpredicted: #{nr_unpredicted}" + cv.statistics + cv.update_attributes(finished_at: Time.now) + cv end - $gridfs.find_one(_id: confidence_plot_id).data - end - - #Average area under roc 0.646 - #Area under roc 0.646 - #F measure carcinogen: 0.769, noncarcinogen: 0.348 - end - - class RegressionCrossValidation < CrossValidation - - field :rmse, type: Float - field :mae, type: Float - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - - def statistics - stat = ValidationStatistics.regression predictions - update_attributes(stat) - stat end - def misclassifications n=nil - n ||= 10 - model = Model::Lazar.find(self.model_id) - training_dataset = Dataset.find(model.training_dataset_id) - prediction_feature = training_dataset.features.first - predictions.collect do |p| - unless p.include? nil - compound = Compound.find(p[0]) - neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) - neighbors.collect! 
do |n| - neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} - end - { - :smiles => compound.smiles, - :measured => p[1], - :predicted => p[2], - :error => (p[1]-p[2]).abs, - :relative_error => (p[1]-p[2]).abs/p[1], - :confidence => p[3], - :neighbors => neighbors - } - end - end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] + class ClassificationCrossValidation < CrossValidation + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :confidence_plot_id, type: BSON::ObjectId end - def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact - R.assign "error", sorted_predictions.collect{|p| p[0]} - R.assign "confidence", sorted_predictions.collect{|p| p[1]} - # TODO fix axis names - R.eval "image = qplot(confidence,error)" - R.eval "image = image + stat_smooth(method='lm', se=FALSE)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - $gridfs.find_one(_id: confidence_plot_id).data + class RegressionCrossValidation < CrossValidation + include RegressionStatistics + field :rmse, type: Float + field :mae, type: Float + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId end - def correlation_plot - unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot id, predictions - update(:correlation_plot_id => plot_id) + class RepeatedCrossValidation < Validation + 
field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation end - $gridfs.find_one(_id: correlation_plot_id).data - end - end - - class RepeatedCrossValidation - field :crossvalidation_ids, type: Array, default: [] - def self.create model, folds=10, repeats=3 - repeated_cross_validation = self.new - repeats.times do |n| - $logger.debug "Crossvalidation #{n+1} for #{model.name}" - repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} end - repeated_cross_validation.save - repeated_cross_validation - end - def crossvalidations - crossvalidation_ids.collect{|id| CrossValidation.find(id)} end end diff --git a/lib/dataset.rb b/lib/dataset.rb index 0c65d61..2e21e5b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -69,7 +69,7 @@ module OpenTox training_idxs = indices-test_idxs training_substances = training_idxs.collect{|i| substances[i]} chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:source => self.id ) + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id substance.dataset_ids.uniq! 
diff --git a/lib/lazar.rb b/lib/lazar.rb index 7bd87f4..1853aba 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -62,7 +62,7 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -82,8 +82,9 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "regression.rb", "validation-statistics.rb", "validation.rb", - "crossvalidation.rb", + "train-test-validation.rb", "leave-one-out-validation.rb", - "experiment.rb", + "crossvalidation.rb", + #"experiment.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 9698e05..7ff65ff 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -1,110 +1,57 @@ module OpenTox - class LeaveOneOutValidation - - field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash - field :finished_at, type: Time - - def self.create model - $logger.debug "#{model.name}: LOO validation started" - t = Time.now - model.training_dataset.features.first.nominal? ? 
klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation - loo = klass.new :model_id => model.id - predictions = model.predict model.training_dataset.substances - predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 - predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measured] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 + module Validation + + class LeaveOneOut < Validation + + def self.create model + $logger.debug "#{model.name}: LOO validation started" + t = Time.now + model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + loo = klass.new :model_id => model.id + predictions = model.predict model.training_dataset.substances + predictions.each{|cid,p| p.delete(:neighbors)} + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 + end + predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end - predictions.delete(cid) unless prediction[:value] and prediction[:measured] + predictions.select!{|cid,p| p[:value] and p[:measurements]} + loo.nr_instances = predictions.size + loo.nr_unpredicted = nr_unpredicted + loo.predictions = predictions + loo.statistics + $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" + loo end - predictions.select!{|cid,p| p[:value] and p[:measured]} - loo.nr_instances = predictions.size - loo.nr_unpredicted = nr_unpredicted - loo.predictions = predictions - loo.statistics - loo.save - $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" - loo - end - def model - Model::Lazar.find model_id end - end - class ClassificationLeaveOneOutValidation < LeaveOneOutValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array, 
default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId - - def statistics - stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) - update_attributes(stat) + class ClassificationLeaveOneOut < LeaveOneOut + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array, default: [] + field :weighted_confusion_matrix, type: Array, default: [] + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash, default: {} + field :predictivity, type: Hash, default: {} + field :confidence_plot_id, type: BSON::ObjectId end - - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - p[:database_activities].each do |db_act| - if p[:value] - p[:value] == db_act ? 
correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - - end - end - end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data + + class RegressionLeaveOneOut < LeaveOneOut + include RegressionStatistics + field :rmse, type: Float, default: 0 + field :mae, type: Float, default: 0 + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId + field :confidence_plot_id, type: BSON::ObjectId end - end - - - class RegressionLeaveOneOutValidation < LeaveOneOutValidation - - field :rmse, type: Float, default: 0 - field :mae, type: Float, default: 0 - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId - def statistics - stat = ValidationStatistics.regression predictions - update_attributes(stat) - end - - def correlation_plot - unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot id, predictions - update(:correlation_plot_id => plot_id) - end - $gridfs.find_one(_id: correlation_plot_id).data - end end end diff --git a/lib/model.rb b/lib/model.rb index 18d621b..988cac9 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -22,7 +22,6 @@ module OpenTox # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model def initialize prediction_feature, training_dataset, params={} - super params # set defaults for empty parameters @@ -39,15 +38,15 @@ module OpenTox def correlation_filter 
self.relevant_features = {} - toxicities = [] + measurements = [] substances = [] training_dataset.substances.each do |s| training_dataset.values(s,prediction_feature_id).each do |act| - toxicities << act + measurements << act substances << s end end - R.assign "tox", toxicities + R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} @@ -62,7 +61,7 @@ module OpenTox self.relevant_features[feature_id]["r"] = r end rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h @@ -71,22 +70,22 @@ module OpenTox def predict_substance substance neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) - database_activities = nil + measurements = nil prediction = {} # handle query substance if neighbors.collect{|n| n["_id"]}.include? substance.id query = neighbors.select{|n| n["_id"] == substance.id}.first - database_activities = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." 
+ measurements = training_dataset.values(query["_id"],prediction_feature_id) + prediction[:measurements] = measurements + prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["toxicities"] + tox = neighbors.first["measurements"] if tox.size == 1 # single measurement value = tox.first else # multiple measurement @@ -141,7 +140,7 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - predictions.each{|cid,p| p.delete(:neighbors)} + #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -187,6 +186,7 @@ module OpenTox model.save model end + end class LazarRegression < Lazar @@ -197,19 +197,21 @@ module OpenTox model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { - :type => "MP2D", :min_sim => 0.1, :dataset_id => training_dataset.id, :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end + model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? 
Compound model.save model end + end class Prediction + include OpenTox include Mongoid::Document include Mongoid::Timestamps diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 5c6d944..d0f8f51 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,58 +6,43 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] field :proteomics, type: Hash, default: {} - - def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id: - dataset = Dataset.find(dataset_id) - neighbors = [] - dataset.nanoparticles.each do |np| - values = dataset.values(np,prediction_feature_id) - if values - common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys - common_descriptors.select!{|id| NumericFeature.find(id) } - query_descriptors = common_descriptors.collect{|d| physchem_descriptors[d].first} - neighbor_descriptors = common_descriptors.collect{|d| np.physchem_descriptors[d].first} - sim = Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors) - neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim - end - end - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - neighbors - end - def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id: + def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: p self.name - #p self.physchem_descriptors.keys.size dataset = Dataset.find(dataset_id) relevant_features = {} - toxicities = [] + measurements = [] substances = [] # TODO: exclude query activities!!! 
dataset.substances.each do |s| - dataset.values(s,prediction_feature_id).each do |act| - toxicities << act - substances << s + if s.core == self.core # exclude nanoparticles with different core + dataset.values(s,prediction_feature_id).each do |act| + measurements << act + substances << s + end end end - R.assign "tox", toxicities + R.assign "tox", measurements feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature} # identify relevant features feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["pvalue"] = pvalue - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + p_value = R.eval("cor$p.value").to_ruby + if p_value <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["p_value"] = p_value + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." 
end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." end end neighbors = [] @@ -68,13 +53,17 @@ module OpenTox # scale values query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - #weights = common_descriptors.collect{|d| 1-relevant_features[d]["pvalue"]} + #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} - #p weights sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) - ##p "SIM" - #p [sim, Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)] - neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + neighbors << { + "_id" => substance.id, + "measurements" => values, + "similarity" => sim, + "common_descriptors" => common_descriptors.collect do |id| + {:id => id, :p_value => relevant_features[id]["p_value"], :r_squared => relevant_features[id]["r"]**2} + end + } if sim >= min_sim end end p neighbors.size @@ -94,10 +83,7 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - # TODO generic way of parsing TOX values - if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" - dataset.add self, feature, Math.log2(value) - elsif feature.name == "Total protein (BCA assay)" + if feature.name == "Total protein (BCA assay)" physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! 
diff --git a/lib/regression.rb b/lib/regression.rb index 6487557..cffcbbf 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -8,7 +8,7 @@ module OpenTox sim_sum = 0.0 neighbors.each do |neighbor| sim = neighbor["similarity"] - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| weighted_sum += sim*act sim_sum += sim @@ -26,7 +26,7 @@ module OpenTox neighbors.each do |n| fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["toxicities"] + activities = n["measurements"] activities.each do |act| values << act weights << n["similarity"] @@ -79,7 +79,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb new file mode 100644 index 0000000..286614a --- /dev/null +++ b/lib/train-test-validation.rb @@ -0,0 +1,58 @@ +module OpenTox + + module Validation + + class TrainTest < Validation + + field :training_dataset_id, type: BSON::ObjectId + field :test_dataset_id, type: BSON::ObjectId + + def self.create model, training_set, test_set + + atts = model.attributes.dup # do not modify attributes of the original model + atts["_id"] = BSON::ObjectId.new + atts[:training_dataset_id] = training_set.id + validation_model = model.class.create model.prediction_feature, training_set, atts + validation_model.save + predictions = validation_model.predict test_set.substances + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 + end + end + predictions.select!{|cid,p| p[:value] and p[:measurements]} + validation = self.new( + :model_id => validation_model.id, + :test_dataset_id => test_set.id, + :nr_instances => 
test_set.substances.size, + :nr_unpredicted => nr_unpredicted, + :predictions => predictions + ) + validation.save + validation + end + + def test_dataset + Dataset.find test_dataset_id + end + + def training_dataset + Dataset.find training_dataset_id + end + + end + + class ClassificationTrainTest < TrainTest + include ClassificationStatistics + end + + class RegressionTrainTest < TrainTest + include RegressionStatistics + end + + end + +end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e61543b..816824b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -1,123 +1,203 @@ module OpenTox - class ValidationStatistics - include OpenTox - def self.classification predictions, accept_values - confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - nr_instances = 0 - predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? 
- if pred[:measured].uniq.size == 1 and pred[:probabilities] - m = pred[:measured].first - if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - end - elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + module Validation + module ClassificationStatistics + + def statistics + self.accept_values = model.prediction_feature.accept_values + self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? 
+ if pred[:measurements].uniq.size == 1 and pred[:probabilities] + m = pred[:measurements].first + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end end end end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f + self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + save + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => weighted_accuracy, + :true_rate => true_rate, + :predictivity => predictivity, + } end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - 
weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + + def confidence_plot + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + p[:measurements].each do |db_act| + if p[:value] + p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[:confidence] + + end + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) end + $gridfs.find_one(_id: confidence_plot_id).data end - accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f - $logger.debug "Accuracy #{accuracy}" - { - :accept_values => accept_values, - :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, - :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, - :true_rate => true_rate, - :predictivity => predictivity, - :finished_at => Time.now - } end - def self.regression predictions - # TODO: predictions within prediction_interval - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |cid,pred| - if pred[:value] and pred[:measured] - x << pred[:measured].median - y << pred[:value] - error = pred[:value]-pred[:measured].median - rmse += error**2 - mae += error.abs - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + module RegressionStatistics + + def statistics + # TODO: predictions within prediction_interval + rmse = 0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measurements] + x << pred[:measurements].median + y << pred[:value] + error = pred[:value]-pred[:measurements].median + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='pairwise')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + } end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - r = R.eval("r").to_ruby - mae = mae/predictions.size - rmse = Math.sqrt(rmse/predictions.size) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" - { - :mae => mae, - :rmse => rmse, - :r_squared => r**2, - :finished_at => Time.now - } - end + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.pdf" + x = [] + y = [] + feature = Feature.find(predictions.first.last["prediction_feature_id"]) + predictions.each do |sid,p| + x << p["value"] + y << p["measurements"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + title = feature.name + title += "[#{feature.unit}]" 
if feature.unit and !feature.unit.blank? + R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end - def self.correlation_plot id, predictions - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = [] - y = [] - predictions.each do |sid,p| - x << p["value"] - y << p["measured"].median + def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false + worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n] + worst_predictions.collect do |p| + substance = Substance.find(p.first) + prediction = p[1] + if show_neigbors + neighbors = prediction["neighbors"].collect do |n| + common_descriptors = [] + if show_common_descriptors + common_descriptors = n["common_descriptors"].collect do |d| + f=Feature.find(d) + { + :id => f.id.to_s, + :name => "#{f.name} (#{f.conditions})", + :p_value => d[:p_value], + :r_squared => d[:r_squared], + } + end + else + common_descriptors = n["common_descriptors"].size + end + { + :name => Substance.find(n["_id"]).name, + :id => n["_id"].to_s, + :common_descriptors => common_descriptors + } + end + else + neighbors = prediction["neighbors"].size + end + { + :id => substance.id.to_s, + :name => substance.name, + :feature => Feature.find(prediction["prediction_feature_id"]).name, + :error => (prediction["value"] - prediction["measurements"].median).abs, + :prediction => prediction["value"], + :measurements => prediction["measurements"], + :neighbors => neighbors + } + end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(measurement,prediction)" - 
R.eval "range = c(min(all), max(all))" - # TODO units - R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) - plot_id end end end diff --git a/lib/validation.rb b/lib/validation.rb index 9122df1..ff9a971 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -1,63 +1,25 @@ module OpenTox - class Validation - - field :model_id, type: BSON::ObjectId - field :prediction_dataset_id, type: BSON::ObjectId - field :crossvalidation_id, type: BSON::ObjectId - field :test_dataset_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash - - def prediction_dataset - Dataset.find prediction_dataset_id - end - - def test_dataset - Dataset.find test_dataset_id - end - - def model - Model::Lazar.find model_id - end - - def self.create model, training_set, test_set, crossvalidation=nil - - atts = model.attributes.dup # do not modify attributes of the original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create model.prediction_feature, training_set, atts - validation_model.save - predictions = validation_model.predict test_set.substances - predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 - predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measured] = test_set.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + module Validation + + class Validation + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "validations" + field :name, type: String + field :model_id, type: BSON::ObjectId + field 
:nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Hash + field :finished_at, type: Time + + def model + Model::Lazar.find model_id end - predictions.select!{|cid,p| p[:value] and p[:measured]} - validation = self.new( - :model_id => validation_model.id, - :test_dataset_id => test_set.id, - :nr_instances => test_set.substances.size, - :nr_unpredicted => nr_unpredicted, - :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence - ) - validation.crossvalidation_id = crossvalidation.id if crossvalidation - validation.save - validation - end - - end - class ClassificationValidation < Validation - end + end - class RegressionValidation < Validation end end diff --git a/test/classification.rb b/test/classification.rb index df7cba9..9104022 100644 --- a/test/classification.rb +++ b/test/classification.rb @@ -20,7 +20,7 @@ class LazarClassificationTest < MiniTest::Test compound = Compound.from_smiles "CCO" prediction = model.predict compound assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:database_activities] + assert_equal ["false"], prediction[:measurements] # make a dataset prediction compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 1cd1ff0..f0ded2f 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -11,7 +11,7 @@ class NanoparticleTest < MiniTest::Test def test_create_model_with_feature_selection training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => 
"nanoparticle_neighbors", :feature_selection_algorithm => "correlation_filter"}) + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :feature_selection_algorithm => "correlation_filter"}) nanoparticle = training_dataset.nanoparticles[-34] #p nanoparticle.neighbors prediction = model.predict nanoparticle @@ -23,7 +23,7 @@ class NanoparticleTest < MiniTest::Test def test_create_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors"}) + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors"}) nanoparticle = training_dataset.nanoparticles[-34] prediction = model.predict nanoparticle refute_nil prediction[:value] @@ -31,13 +31,67 @@ class NanoparticleTest < MiniTest::Test model.delete end + # TODO move to validation-statistics + def test_inspect_cv + cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last + cv.correlation_plot_id = nil + File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} + #p cv +=begin + #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} + cv.predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,5].each do |sid,p| + s = Substance.find(sid) + puts + p s.name + p([p["value"],p["measurements"],(p["value"]-p["measured"].median).abs]) + neighbors = s.physchem_neighbors dataset_id: cv.model.training_dataset_id, prediction_feature_id: 
cv.model.prediction_feature_id, type: nil + neighbors.each do |n| + neighbor = Substance.find(n["_id"]) + p "==" + p neighbor.name, n["similarity"], n["measurements"] + p neighbor.core["name"] + p neighbor.coating.collect{|c| c["name"]} + n["common_descriptors"].each do |id| + f = Feature.find(id) + print "#{f.name} #{f.conditions["MEDIUM"]}" + print ", " + end + puts + end + + end +=end + end + def test_inspect_worst_prediction +# TODO check/fix single/double neighbor prediction + cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last + worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false) + assert_equal 3, worst_predictions.size + assert_kind_of Integer, worst_predictions.first[:neighbors] + worst_predictions = cv.worst_predictions + #puts worst_predictions.to_yaml + assert_equal 5, worst_predictions.size + assert_kind_of Array, worst_predictions.first[:neighbors] + assert_kind_of Integer, worst_predictions.first[:neighbors].first[:common_descriptors] + worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true) + puts worst_predictions.to_yaml + assert_equal 2, worst_predictions.size + assert_kind_of Array, worst_predictions.first[:neighbors] + refute_nil worst_predictions.first[:neighbors].first[:common_descriptors] + #p cv.model.training_dataset.features + end + def test_validate_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) + #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") + feature = 
Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") + + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) cv = RegressionCrossValidation.create model - p cv - File.open("tmp.png","w+"){|f| f.puts cv.correlation_plot} + p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs} + p cv.rmse + p cv.r_squared + File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} refute_nil cv.r_squared refute_nil cv.rmse end @@ -45,7 +99,7 @@ class NanoparticleTest < MiniTest::Test def test_validate_pls_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors"}) cv = RegressionCrossValidation.create model p cv File.open("tmp.png","w+"){|f| f.puts cv.correlation_plot} @@ -79,7 +133,7 @@ class NanoparticleTest < MiniTest::Test toxcounts = {} pccounts = {} Nanoparticle.all.each do |np| - np.toxicities.each do |t,v| + np.measurements.each do |t,v| toxcounts[t] ||= 0 toxcounts[t] += 1#v.uniq.size end diff --git a/test/setup.rb b/test/setup.rb index 6c97282..e7c32b4 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= 
File.join(TEST_DIR,"data") -#$mongo.database.drop -#$gridfs = $mongo.database.fs +$mongo.database.drop +$gridfs = $mongo.database.fs diff --git a/test/validation.rb b/test/validation.rb index 39314da..a259472 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -1,6 +1,7 @@ require_relative "setup.rb" class ValidationTest < MiniTest::Test + include OpenTox::Validation # defaults @@ -86,7 +87,7 @@ class ValidationTest < MiniTest::Test def test_classification_loo_validation dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::LazarClassification.create dataset.features.first, dataset - loo = ClassificationLeaveOneOutValidation.create model + loo = ClassificationLeaveOneOut.create model assert_equal 14, loo.nr_unpredicted refute_empty loo.confusion_matrix assert loo.accuracy > 0.77 @@ -96,7 +97,7 @@ class ValidationTest < MiniTest::Test def test_regression_loo_validation dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") model = Model::LazarRegression.create dataset.features.first, dataset - loo = RegressionLeaveOneOutValidation.create model + loo = RegressionLeaveOneOut.create model assert loo.r_squared > 0.34, "R^2 (#{loo.r_squared}) should be larger than 0.34" end -- cgit v1.2.3