From 0882c2cd0de934d7377fc9d08c306be98612c88a Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch"
Date: Fri, 16 Nov 2018 18:42:42 +0100
Subject: real datasets for testing, test data cleanup, Daphnia import, upper
 and lower similarity thresholds

---
 lib/crossvalidation.rb       |   2 -
 lib/download.rb              |  12 +++-
 lib/model.rb                 |  42 +++++++----
 lib/validation-statistics.rb | 163 ++++++++++++++++++------------------------
 4 files changed, 104 insertions(+), 115 deletions(-)

diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 8719dca..e1761bc 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -65,9 +65,7 @@ module OpenTox
     include ClassificationStatistics
     field :accept_values, type: Array
     field :confusion_matrix, type: Hash
-    field :weighted_confusion_matrix, type: Hash
     field :accuracy, type: Hash
-    field :weighted_accuracy, type: Hash
     field :true_rate, type: Hash
     field :predictivity, type: Hash
     field :nr_predictions, type: Hash
diff --git a/lib/download.rb b/lib/download.rb
index 5467167..f17d060 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -249,11 +249,17 @@ module OpenTox
     # Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the public folder
     # The original file requires an email request, this is a temporary workaround
     def self.daphnia
-      url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+      #url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+      src = File.join(DATA,"parts","toxicity_data.xlsx")
       name = "Acute_toxicity-Daphnia_magna"
       $logger.debug name
       File.open(File.join(DATA,name+".csv"),"w+") do |f|
-        f.puts RestClientWrapper.get(url).to_s
+        i = 0
+        CSV.parse(`xlsx2csv #{src}`) do |row|
+          i == 0 ? v = "-log[LC50_mmol/L]" : v = -Math.log10(10**-row[3].to_f*1000)
+          f.puts [row[0],row[1],v].join(",")
+          i += 1
+        end
       end
       meta = { "species": "Daphnia magna",
         "endpoint": "Acute toxicity",
@@ -289,7 +295,7 @@
           :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
         }
       ].each do |assay|
-        Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogen", inactive: "non-carcinogen", qmrf: assay[:qmrf]
+        Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogenic", inactive: "non-carcinogenic", qmrf: assay[:qmrf]
       end
       Download.mutagenicity
       Download.blood_brain_barrier
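
The rewritten importer shells out to the xlsx2csv command line tool, parses the result with Ruby's csv library, and re-expresses the Daphnia LC50 values on a -log10 scale. The expression -Math.log10(10**-row[3].to_f*1000) is consistent with column 4 of the spreadsheet holding -log10(LC50) in mol/L: it back-transforms to a concentration, rescales to mmol/L, and takes the negative log again, which amounts to subtracting 3. A standalone sketch of that conversion (the helper name is hypothetical, not part of the patch):

    # to_log10_mmol is a hypothetical helper illustrating the conversion above.
    # Input: -log10(LC50) with LC50 in mol/L; output: -log10(LC50) with LC50 in mmol/L.
    def to_log10_mmol neg_log10_molar
      lc50_mmol = 10**-neg_log10_molar * 1000   # back-transform, then mol/L -> mmol/L
      -Math.log10(lc50_mmol)                    # equals neg_log10_molar - 3
    end

    puts to_log10_mmol(5.0) # => 2.0 (1e-5 mol/L == 1e-2 mmol/L)
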
diff --git a/lib/model.rb b/lib/model.rb
index caf8a6e..08ca07e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -32,7 +32,7 @@ module OpenTox
     # @param [OpenTox::Feature, nil] prediction_feature
     #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
     # @param [Hash, nil] algorithms
-    #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+    #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
     #
     # @return [OpenTox::Model::Lazar]
     def self.create prediction_feature:nil, training_dataset:, algorithms:{}
@@ -80,7 +80,7 @@ module OpenTox
         }
         model.algorithms[:similarity] = {
           :method => "Algorithm::Similarity.tanimoto",
-          :min => 0.5,
+          :min => [0.5,0.2],
         }
       elsif model.class == LazarRegression
         model.algorithms[:prediction] = {
@@ -88,7 +88,7 @@ module OpenTox
         }
         model.algorithms[:similarity] = {
           :method => "Algorithm::Similarity.tanimoto",
-          :min => 0.5,
+          :min => [0.5,0.2],
         }
       end
@@ -100,7 +100,7 @@ module OpenTox
         },
         :similarity => {
           :method => "Algorithm::Similarity.weighted_cosine",
-          :min => 0.5,
+          :min => [0.5,0.2],
         },
         :prediction => {
           :method => "Algorithm::Caret.rf",
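
:min is now a two-element array instead of a single cutoff: the first entry is the similarity threshold for the initial, high-confidence pass, the second the fallback used when that pass produces warnings. A minimal sketch of overriding the pair at model creation (the CSV file name and the Dataset.from_csv_file call are assumptions; Lazar.create and the algorithm keys follow the code above):

    require "lazar"
    include OpenTox

    # Assumption: a training CSV such as the Daphnia set produced by the download code.
    training_dataset = Dataset.from_csv_file "Acute_toxicity-Daphnia_magna.csv"
    model = Model::Lazar.create(
      training_dataset: training_dataset,
      algorithms: {
        :similarity => {
          :method => "Algorithm::Similarity.tanimoto",
          :min => [0.5, 0.2], # [high-confidence threshold, low-confidence fallback]
        }
      }
    )
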
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." + return prediction + end else query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm @@ -276,11 +284,17 @@ module OpenTox prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end - if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 - prediction - else # try again with a lower threshold - prediction[:warnings] << "Lowering similarity threshold to 0.2." - predict_substance substance, 0.2, prediction + if threshold == algorithms[:similarity][:min].first + if prediction[:warnings].empty? + prediction[:confidence] = "High (close to bioassay results)" + return prediction + else # try again with a lower threshold + prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." + predict_substance substance, algorithms[:similarity][:min].last, prediction + end + elsif threshold < algorithms[:similarity][:min].first + prediction[:confidence] = "Low (lower than bioassay results)" + return prediction end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index f3e3af8..8a8970e 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -7,79 +7,55 @@ module OpenTox # @return [Hash] def statistics self.accept_values = model.prediction_feature.accept_values - self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}} - self.weighted_confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}} - self.nr_predictions = {:all => 0,:without_warnings => 0} + self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}} + self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? + # TODO: use measured majority class or all measurements?? if pred[:measurements].uniq.size == 1 and pred[:probabilities] m = pred[:measurements].first if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[:all][0][0] += 1 - weighted_confusion_matrix[:all][0][0] += pred[:probabilities][pred[:value]] - self.nr_predictions[:all] += 1 - if pred[:warnings].empty? 
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index f3e3af8..8a8970e 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -7,79 +7,55 @@ module OpenTox
     # @return [Hash]
     def statistics
       self.accept_values = model.prediction_feature.accept_values
-      self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
-      self.weighted_confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
-      self.nr_predictions = {:all => 0,:without_warnings => 0}
+      self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+      self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0}
       predictions.each do |cid,pred|
-        # TODO
-        # use predictions without probabilities (single neighbor)??
-        # use measured majority class??
+        # TODO: use measured majority class or all measurements??
         if pred[:measurements].uniq.size == 1 and pred[:probabilities]
           m = pred[:measurements].first
           if pred[:value] == m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[:all][0][0] += 1
-              weighted_confusion_matrix[:all][0][0] += pred[:probabilities][pred[:value]]
-              self.nr_predictions[:all] += 1
-              if pred[:warnings].empty?
-                confusion_matrix[:without_warnings][0][0] += 1
-                weighted_confusion_matrix[:without_warnings][0][0] += pred[:probabilities][pred[:value]]
-                self.nr_predictions[:without_warnings] += 1
-              end
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[:all][1][1] += 1
-              weighted_confusion_matrix[:all][1][1] += pred[:probabilities][pred[:value]]
-              self.nr_predictions[:all] += 1
-              if pred[:warnings].empty?
-                confusion_matrix[:without_warnings][1][1] += 1
-                weighted_confusion_matrix[:without_warnings][1][1] += pred[:probabilities][pred[:value]]
-                self.nr_predictions[:without_warnings] += 1
+            accept_values.each_with_index do |v,i|
+              if pred[:value] == v
+                confusion_matrix[:all][i][i] += 1
+                self.nr_predictions[:all] += 1
+                if pred[:confidence].match(/High/i)
+                  confusion_matrix[:confidence_high][i][i] += 1
+                  self.nr_predictions[:confidence_high] += 1
+                elsif pred[:confidence].match(/Low/i)
+                  confusion_matrix[:confidence_low][i][i] += 1
+                  self.nr_predictions[:confidence_low] += 1
+                end
              end
            end
          elsif pred[:value] != m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[:all][0][1] += 1
-              weighted_confusion_matrix[:all][0][1] += pred[:probabilities][pred[:value]]
-              self.nr_predictions[:all] += 1
-              if pred[:warnings].empty?
-                confusion_matrix[:without_warnings][0][1] += 1
-                weighted_confusion_matrix[:without_warnings][0][1] += pred[:probabilities][pred[:value]]
-                self.nr_predictions[:without_warnings] += 1
-              end
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[:all][1][0] += 1
-              weighted_confusion_matrix[:all][1][0] += pred[:probabilities][pred[:value]]
-              self.nr_predictions[:all] += 1
-              if pred[:warnings].empty?
-                confusion_matrix[:without_warnings][1][0] += 1
-                weighted_confusion_matrix[:without_warnings][1][0] += pred[:probabilities][pred[:value]]
-                self.nr_predictions[:without_warnings] += 1
+            accept_values.each_with_index do |v,i|
+              if pred[:value] == v
+                confusion_matrix[:all][i][(i+1)%2] += 1
+                self.nr_predictions[:all] += 1
+                if pred[:confidence].match(/High/i)
+                  confusion_matrix[:confidence_high][i][(i+1)%2] += 1
+                  self.nr_predictions[:confidence_high] += 1
+                elsif pred[:confidence].match(/Low/i)
+                  confusion_matrix[:confidence_low][i][(i+1)%2] += 1
+                  self.nr_predictions[:confidence_low] += 1
+                end
              end
            end
          end
        end
      end
-      self.true_rate = {:all => {}, :without_warnings => {}}
-      self.predictivity = {:all => {}, :without_warnings => {}}
+
+      self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}}
+      self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}}
       accept_values.each_with_index do |v,i|
-        [:all,:without_warnings].each do |a|
+        [:all,:confidence_high,:confidence_low].each do |a|
           self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f
           self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f
         end
       end
-      confidence_sum = {:all => 0, :without_warnings => 0}
-      [:all,:without_warnings].each do |a|
-        weighted_confusion_matrix[a].each do |r|
-          r.each do |c|
-            confidence_sum[a] += c
-          end
-        end
-      end
       self.accuracy = {}
-      self.weighted_accuracy = {}
-      [:all,:without_warnings].each do |a|
+      [:all,:confidence_high,:confidence_low].each do |a|
         self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f
-        self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f
       end
       $logger.debug "Accuracy #{accuracy}"
       $logger.debug "Nr Predictions #{nr_predictions}"
@@ -87,9 +63,7 @@ module OpenTox
       {
         :accept_values => accept_values,
         :confusion_matrix => confusion_matrix,
-        :weighted_confusion_matrix => weighted_confusion_matrix,
         :accuracy => accuracy,
-        :weighted_accuracy => weighted_accuracy,
         :true_rate => self.true_rate,
         :predictivity => self.predictivity,
         :nr_predictions => nr_predictions,
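
Classification performance is now reported three times — over all predictions (:all) and separately for the high- and low-confidence strata — replacing the former :without_warnings split and the probability-weighted matrix. A self-contained sketch of the per-stratum accuracy computed above, with a made-up 2x2 confusion matrix (rows and columns follow the accept_values order):

    confusion_matrix = {
      :all             => [[40, 10], [8, 42]],
      :confidence_high => [[30,  4], [3, 33]],
      :confidence_low  => [[10,  6], [5,  9]],
    }
    nr_predictions = confusion_matrix.transform_values { |m| m.flatten.sum }

    accuracy = {}
    [:all, :confidence_high, :confidence_low].each do |a|
      accuracy[a] = (confusion_matrix[a][0][0] + confusion_matrix[a][1][1]) / nr_predictions[a].to_f
    end
    p accuracy # => {:all=>0.82, :confidence_high=>0.9, :confidence_low=>0.6333333333333333}
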
@@ -138,47 +112,27 @@ module OpenTox
   # Statistical evaluation of regression validations
   module RegressionStatistics
+    attr_accessor :x, :y
+
     # Get statistics
     # @return [Hash]
     def statistics
       self.warnings = []
-      self.rmse = {:all =>0,:without_warnings => 0}
-      self.r_squared = {:all =>0,:without_warnings => 0}
-      self.mae = {:all =>0,:without_warnings => 0}
-      self.within_prediction_interval = {:all =>0,:without_warnings => 0}
-      self.out_of_prediction_interval = {:all =>0,:without_warnings => 0}
-      x = {:all => [],:without_warnings => []}
-      y = {:all => [],:without_warnings => []}
-      self.nr_predictions = {:all =>0,:without_warnings => 0}
+      self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+      self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+      self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+      self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+      self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+      @x = {:all => [],:confidence_high => [],:confidence_low => []}
+      @y = {:all => [],:confidence_high => [],:confidence_low => []}
+      self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0}
       predictions.each do |cid,pred|
         if pred[:value] and pred[:measurements] and !pred[:measurements].empty?
-          self.nr_predictions[:all] +=1
-          x[:all] << pred[:measurements].median
-          y[:all] << pred[:value]
-          error = pred[:value]-pred[:measurements].median
-          self.rmse[:all] += error**2
-          self.mae[:all] += error.abs
-          if pred[:prediction_interval]
-            if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
-              self.within_prediction_interval[:all] += 1
-            else
-              self.out_of_prediction_interval[:all] += 1
-            end
-          end
-          if pred[:warnings].empty?
-            self.nr_predictions[:without_warnings] +=1
-            x[:without_warnings] << pred[:measurements].median
-            y[:without_warnings] << pred[:value]
-            error = pred[:value]-pred[:measurements].median
-            self.rmse[:without_warnings] += error**2
-            self.mae[:without_warnings] += error.abs
-            if pred[:prediction_interval]
-              if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
-                self.within_prediction_interval[:without_warnings] += 1
-              else
-                self.out_of_prediction_interval[:without_warnings] += 1
-              end
-            end
+          insert_prediction pred, :all
+          if pred[:confidence].match(/High/i)
+            insert_prediction pred, :confidence_high
+          elsif pred[:confidence].match(/Low/i)
+            insert_prediction pred, :confidence_low
           end
         else
           trd_id = model.training_dataset_id
@@ -187,10 +141,10 @@ module OpenTox
           $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
         end
       end
-      [:all,:without_warnings].each do |a|
-        if x[a].size > 2
-          R.assign "measurement", x[a]
-          R.assign "prediction", y[a]
+      [:all,:confidence_high,:confidence_low].each do |a|
+        if @x[a].size > 2
+          R.assign "measurement", @x[a]
+          R.assign "prediction", @y[a]
           R.eval "r <- cor(measurement,prediction,use='pairwise')"
           self.r_squared[a] = R.eval("r").to_ruby**2
         else
@@ -209,7 +163,6 @@ module OpenTox
       $logger.debug "MAE #{mae}"
       $logger.debug "Nr predictions #{nr_predictions}"
      $logger.debug "#{within_prediction_interval} measurements within prediction interval"
-      $logger.debug "#{warnings}"
       save
       {
         :mae => mae,
@@ -270,6 +223,24 @@ module OpenTox
       end
       worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
     end
+
+    private
+
+    def insert_prediction prediction, type
+      self.nr_predictions[type] +=1
+      @x[type] << prediction[:measurements].median
+      @y[type] << prediction[:value]
+      error = prediction[:value]-prediction[:measurements].median
+      self.rmse[type] += error**2
+      self.mae[type] += error.abs
+      if prediction[:prediction_interval]
+        if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1]
+          self.within_prediction_interval[type] += 1
+        else
+          self.out_of_prediction_interval[type] += 1
+        end
+      end
+    end
   end
  end
 end
--
cgit v1.2.3
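
RegressionStatistics follows the same pattern: insert_prediction accumulates squared and absolute errors (and prediction-interval hits) per stratum, and the unchanged remainder of the statistics method — not shown in this hunk — presumably turns the sums into RMSE and MAE. As a self-contained reminder of what those stratified numbers measure (illustrative values only):

    # x: measured medians, y: predictions for one stratum, e.g. -log10(LC50) values
    x = [3.1, 2.4, 4.0, 3.3]
    y = [2.9, 2.8, 3.6, 3.4]

    errors = y.zip(x).map { |pred, meas| pred - meas }
    rmse = Math.sqrt(errors.map { |e| e**2 }.sum / errors.size.to_f)
    mae  = errors.map(&:abs).sum / errors.size.to_f
    puts "RMSE #{rmse.round(3)}, MAE #{mae.round(3)}"  # => RMSE 0.304, MAE 0.275
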