From 9d17895ab9e8cd31e0f32e8e622e13612ea5ff77 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 12 Oct 2018 21:58:36 +0200 Subject: validation statistic fixes --- lib/classification.rb | 6 -- lib/crossvalidation.rb | 3 +- lib/dataset.rb | 108 +-------------------------------- lib/leave-one-out-validation.rb | 30 +++++----- lib/validation-statistics.rb | 128 +++++++++++++++++++++------------------- 5 files changed, 86 insertions(+), 189 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 468c06a..e78783b 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -18,12 +18,6 @@ module OpenTox class_weights.each do |a,w| probabilities[a] = w.sum/weights.sum end - # DG: hack to ensure always two probability values - # TODO: does not work for arbitrary feature names FIX!! -# if probabilities.keys.uniq.size == 1 -# missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] -# probabilities[missing_key] = 0.0 -# end probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d1347a5..2e44ff2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -35,13 +35,12 @@ module OpenTox cv.validation_ids << validation.id cv.nr_instances += validation.nr_instances cv.nr_unpredicted += validation.nr_unpredicted - #cv.predictions.merge! validation.predictions $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end #Process.waitall cv.save - $logger.debug "Nr unpredicted: #{nr_unpredicted}" + $logger.debug "Nr unpredicted: #{cv.nr_unpredicted}" cv.statistics cv.update_attributes(finished_at: Time.now) cv diff --git a/lib/dataset.rb b/lib/dataset.rb index b6c6173..bbb20be 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -384,6 +384,9 @@ module OpenTox end chunks end + + def transform # TODO + end # Delete dataset def delete @@ -419,109 +422,4 @@ module OpenTox end - class Batch - - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "batch" - field :name, type: String - field :source, type: String - field :identifiers, type: Array - field :ids, type: Array - field :compounds, type: Array - field :warnings, type: Array, default: [] - - def self.from_csv_file file - source = file - name = File.basename(file,".*") - batch = self.find_by(:source => source, :name => name) - if batch - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - # check delimiter - line = File.readlines(file).first - if line.match(/\t/) - table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8' - else - table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - end - batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => []) - - # original IDs - if table[0][0] =~ /ID/i - @original_ids = table.collect{|row| row.shift} - @original_ids.shift - end - - # features - feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip - unless compound_format =~ /SMILES|InChI/i - File.delete file - bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ - "Accepted formats: SMILES, InChI. Please take a look on the help page." - end - #numeric = [] - features = [] - # guess feature types - feature_names.each_with_index do |f,i| - metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - #numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) - else - metadata["accept_values"] = values.sort - #numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) - end - features << feature if feature - end - - table.each_with_index do |vals,i| - identifier = vals.shift.strip.gsub(/^'|'$/,"") - begin - case compound_format - when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) - when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) - end - rescue - compound = nil - end - # collect only for present compounds - unless compound.nil? - batch.identifiers << identifier - batch.compounds << compound.id - batch.ids << @original_ids[i] if @original_ids - else - batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." - end - end - batch.compounds.duplicates.each do |duplicate| - $logger.debug "Duplicates found in #{name}." - dup = Compound.find duplicate - positions = [] - batch.compounds.each_with_index do |co,i| - c = Compound.find co - if !c.blank? and c.inchi and c.inchi == dup.inchi - positions << i+1 - end - end - batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}." - end - batch.save - end - batch - end - - end - end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index c33c92b..b0905b8 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -12,7 +12,7 @@ module OpenTox bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" t = Time.now - model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut loo = klass.new :model_id => model.id predictions = model.predict model.training_dataset.substances predictions.each{|cid,p| p.delete(:neighbors)} @@ -40,25 +40,27 @@ module OpenTox class ClassificationLeaveOneOut < LeaveOneOut include ClassificationStatistics field :accept_values, type: Array - field :confusion_matrix, type: Array, default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId + field :confusion_matrix, type: Hash + field :weighted_confusion_matrix, type: Hash + field :accuracy, type: Hash + field :weighted_accuracy, type: Hash + field :true_rate, type: Hash + field :predictivity, type: Hash + field :nr_predictions, type: Hash + field :probability_plot_id, type: BSON::ObjectId end # Leave one out validation for regression models class RegressionLeaveOneOut < LeaveOneOut include RegressionStatistics - field :rmse, type: Float, default: 0 - field :mae, type: Float, default: 0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId + field :rmse, type: Hash + field :mae, type: Hash + field :r_squared, type: Hash + field :within_prediction_interval, type: Hash + field :out_of_prediction_interval, type: Hash + field :nr_predictions, type: Hash field :warnings, type: Array + field :correlation_plot_id, type: BSON::ObjectId end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index a69ede3..e440731 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -9,8 +9,7 @@ module OpenTox self.accept_values = model.prediction_feature.accept_values self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}} self.weighted_confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}} - #self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - self.nr_predictions = {:all => 0,:without_warnings => 0} + self.nr_predictions = {:all => 0,:without_warnings => 0} predictions.each do |cid,pred| # TODO # use predictions without probabilities (single neighbor)?? @@ -21,41 +20,41 @@ module OpenTox if pred[:value] == accept_values[0] confusion_matrix[:all][0][0] += 1 weighted_confusion_matrix[:all][0][0] += pred[:probabilities][pred[:value]] - self.nr_predictions[:all] += 1 - if pred[:warnings].empty? + self.nr_predictions[:all] += 1 + if pred[:warnings].empty? confusion_matrix[:without_warnings][0][0] += 1 weighted_confusion_matrix[:without_warnings][0][0] += pred[:probabilities][pred[:value]] - self.nr_predictions[:without_warnings] += 1 - end + self.nr_predictions[:without_warnings] += 1 + end elsif pred[:value] == accept_values[1] confusion_matrix[:all][1][1] += 1 weighted_confusion_matrix[:all][1][1] += pred[:probabilities][pred[:value]] - self.nr_predictions[:all] += 1 - if pred[:warnings].empty? + self.nr_predictions[:all] += 1 + if pred[:warnings].empty? confusion_matrix[:without_warnings][1][1] += 1 weighted_confusion_matrix[:without_warnings][1][1] += pred[:probabilities][pred[:value]] - self.nr_predictions[:without_warnings] += 1 - end + self.nr_predictions[:without_warnings] += 1 + end end elsif pred[:value] != m if pred[:value] == accept_values[0] confusion_matrix[:all][0][1] += 1 weighted_confusion_matrix[:all][0][1] += pred[:probabilities][pred[:value]] - self.nr_predictions[:all] += 1 - if pred[:warnings].empty? + self.nr_predictions[:all] += 1 + if pred[:warnings].empty? confusion_matrix[:without_warnings][0][1] += 1 weighted_confusion_matrix[:without_warnings][0][1] += pred[:probabilities][pred[:value]] - self.nr_predictions[:without_warnings] += 1 - end + self.nr_predictions[:without_warnings] += 1 + end elsif pred[:value] == accept_values[1] confusion_matrix[:all][1][0] += 1 weighted_confusion_matrix[:all][1][0] += pred[:probabilities][pred[:value]] - self.nr_predictions[:all] += 1 - if pred[:warnings].empty? + self.nr_predictions[:all] += 1 + if pred[:warnings].empty? confusion_matrix[:without_warnings][1][0] += 1 weighted_confusion_matrix[:without_warnings][1][0] += pred[:probabilities][pred[:value]] - self.nr_predictions[:without_warnings] += 1 - end + self.nr_predictions[:without_warnings] += 1 + end end end end @@ -63,25 +62,25 @@ module OpenTox self.true_rate = {:all => {}, :without_warnings => {}} self.predictivity = {:all => {}, :without_warnings => {}} accept_values.each_with_index do |v,i| - [:all,:without_warnings].each do |a| - self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f - self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f - end + [:all,:without_warnings].each do |a| + self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f + self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f + end end confidence_sum = {:all => 0, :without_warnings => 0} [:all,:without_warnings].each do |a| weighted_confusion_matrix[a].each do |r| r.each do |c| confidence_sum[a] += c - end + end end end - self.accuracy = {} - self.weighted_accuracy = {} + self.accuracy = {} + self.weighted_accuracy = {} [:all,:without_warnings].each do |a| self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f - end + end $logger.debug "Accuracy #{accuracy}" save { @@ -92,7 +91,7 @@ module OpenTox :weighted_accuracy => weighted_accuracy, :true_rate => self.true_rate, :predictivity => self.predictivity, - :nr_predictions => nr_predictions, + :nr_predictions => nr_predictions, } end @@ -143,19 +142,20 @@ module OpenTox def statistics self.warnings = [] self.rmse = {:all =>0,:without_warnings => 0} + self.r_squared = {:all =>0,:without_warnings => 0} self.mae = {:all =>0,:without_warnings => 0} self.within_prediction_interval = {:all =>0,:without_warnings => 0} self.out_of_prediction_interval = {:all =>0,:without_warnings => 0} x = {:all => [],:without_warnings => []} y = {:all => [],:without_warnings => []} self.nr_predictions = {:all =>0,:without_warnings => 0} - error = {} predictions.each do |cid,pred| + p pred if pred[:value] and pred[:measurements] - self.nr_predictions[:all] +=1 + self.nr_predictions[:all] +=1 x[:all] << pred[:measurements].median y[:all] << pred[:value] - error[:all] = pred[:value]-pred[:measurements].median + error = pred[:value]-pred[:measurements].median self.rmse[:all] += error**2 self.mae[:all] += error.abs if pred[:prediction_interval] @@ -165,21 +165,21 @@ module OpenTox self.out_of_prediction_interval[:all] += 1 end end - if pred[:warnings].empty? - self.nr_predictions[:without_warnings] +=1 - x[:without_warnings] << pred[:measurements].median - y[:without_warnings] << pred[:value] - error[:without_warnings] = pred[:value]-pred[:measurements].median - self.rmse[:without_warnings] += error**2 - self.mae[:without_warnings] += error.abs - if pred[:prediction_interval] - if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] - self.within_prediction_interval[:without_warnings] += 1 - else - self.out_of_prediction_interval[:without_warnings] += 1 - end - end - end + if pred[:warnings].empty? + self.nr_predictions[:without_warnings] +=1 + x[:without_warnings] << pred[:measurements].median + y[:without_warnings] << pred[:value] + error = pred[:value]-pred[:measurements].median + self.rmse[:without_warnings] += error**2 + self.mae[:without_warnings] += error.abs + if pred[:prediction_interval] + if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] + self.within_prediction_interval[:without_warnings] += 1 + else + self.out_of_prediction_interval[:without_warnings] += 1 + end + end + end else trd_id = model.training_dataset_id smiles = Compound.find(cid).smiles @@ -187,36 +187,40 @@ module OpenTox $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}." end end - [:all,:without_warnings].each do |a| - R.assign "measurement", x[a] - R.assign "prediction", y[a] - R.eval "r <- cor(measurement,prediction,use='pairwise')" - self.r_squared[a] = R.eval("r").to_ruby**2 - self.mae[a] = self.mae[a]/self.nr_predictions[a] - self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a]) - end + [:all,:without_warnings].each do |a| + if x[a].size > 2 + R.assign "measurement", x[a] + R.assign "prediction", y[a] + R.eval "r <- cor(measurement,prediction,use='pairwise')" + self.r_squared[a] = R.eval("r").to_ruby**2 + else + self.r_squared[a] = 0 + end + if self.nr_predictions[a] > 0 + self.mae[a] = self.mae[a]/self.nr_predictions[a] + self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a]) + else + self.mae[a] = nil + self.rmse[a] = nil + end + end $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" + $logger.debug "Nr predictions #{nr_predictions}" + $logger.debug "#{within_prediction_interval} measurements within prediction interval" $logger.debug "#{warnings}" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, - :within_prediction_interval => within_prediction_interval, + :within_prediction_interval => self.within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, - :nr_predictions => nr_predictions, + :nr_predictions => nr_predictions, } end - # Get percentage of measurements within the prediction interval - # @return [Float] - def percent_within_prediction_interval - 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) - end - # Plot predicted vs measured values # @param [String,nil] format # @return [Blob] -- cgit v1.2.3