author     helma@in-silico.ch <helma@in-silico.ch>  2018-11-16 18:42:42 +0100
committer  helma@in-silico.ch <helma@in-silico.ch>  2018-11-16 18:42:42 +0100
commit     0882c2cd0de934d7377fc9d08c306be98612c88a (patch)
tree       683da6042a5cc4d1786c79fa94d02111ca4af67a /lib
parent     7e547fd4a296f497615a7805d565b378cb1bd7cd (diff)
real datasets for testing, test data cleanup, Daphnia import, upper and lower similarity thresholds
Diffstat (limited to 'lib')
-rw-r--r--  lib/crossvalidation.rb        2
-rw-r--r--  lib/download.rb              12
-rw-r--r--  lib/model.rb                 42
-rw-r--r--  lib/validation-statistics.rb 163
4 files changed, 104 insertions, 115 deletions
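
The "upper and lower similarity thresholds" from the commit message show up in the model.rb hunks below: the single :min => 0.5 becomes :min => [0.5,0.2], and predict_substance retries once at the lower threshold before declaring the query out of the applicability domain. A minimal sketch of that two-pass fallback, with a made-up Neighbor struct and a simple similarity-weighted mean standing in for the library's actual prediction algorithms:

    Neighbor = Struct.new(:id, :similarity, :value)

    # First pass uses the high-confidence threshold; if it yields fewer than
    # two neighbors, retry once with the low-confidence threshold; failing
    # that, the query is outside the applicability domain.
    def predict neighbors, thresholds = [0.5, 0.2]
      thresholds.each_with_index do |threshold, pass|
        usable = neighbors.select { |n| n.similarity >= threshold }
        next if usable.size < 2 && pass == 0   # retry with the lower threshold
        return {:value => nil, :confidence => "Out of applicability domain"} if usable.size < 2
        value = usable.sum { |n| n.value * n.similarity } / usable.sum(&:similarity)
        return {:value => value, :confidence => pass == 0 ? "High" : "Low"}
      end
    end

    neighbors = [Neighbor.new("a", 0.6, 2.4), Neighbor.new("b", 0.3, 2.1), Neighbor.new("c", 0.25, 1.8)]
    p predict(neighbors)   # only one neighbor passes 0.5, so the 0.2 pass returns a "Low" confidence value
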
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 8719dca..e1761bc 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -65,9 +65,7 @@ module OpenTox
include ClassificationStatistics
field :accept_values, type: Array
field :confusion_matrix, type: Hash
- field :weighted_confusion_matrix, type: Hash
field :accuracy, type: Hash
- field :weighted_accuracy, type: Hash
field :true_rate, type: Hash
field :predictivity, type: Hash
field :nr_predictions, type: Hash
diff --git a/lib/download.rb b/lib/download.rb
index 5467167..f17d060 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -249,11 +249,17 @@ module OpenTox
# Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the public folder
# The original file requires an email request; this is a temporary workaround
def self.daphnia
- url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+ #url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+ src = File.join(DATA,"parts","toxicity_data.xlsx")
name = "Acute_toxicity-Daphnia_magna"
$logger.debug name
File.open(File.join(DATA,name+".csv"),"w+") do |f|
- f.puts RestClientWrapper.get(url).to_s
+ i = 0
+ CSV.parse(`xlsx2csv #{src}`) do |row|
+ i == 0 ? v = "-log[LC50_mmol/L]" : v = -Math.log10(10**-row[3].to_f*1000)
+ f.puts [row[0],row[1],v].join(",")
+ i += 1
+ end
end
meta = { "species": "Daphnia magna",
"endpoint": "Acute toxicity",
@@ -289,7 +295,7 @@ module OpenTox
:qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
}
].each do |assay|
- Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogen", inactive: "non-carcinogen", qmrf: assay[:qmrf]
+ Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogenic", inactive: "non-carcinogenic", qmrf: assay[:qmrf]
end
Download.mutagenicity
Download.blood_brain_barrier
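
The Daphnia import above now reads toxicity_data.xlsx via xlsx2csv instead of fetching the prepared CSV. The conversion -Math.log10(10**-row[3].to_f*1000) implies that column 3 of the spreadsheet holds -log10(LC50 [mol/L]) and that the written column re-expresses it per mmol/L; that reading of the column is an assumption taken from the hunk, not documented in the source. A worked example of the arithmetic:

    # -log10(LC50 [mol/L]) = 3.0 means LC50 = 10**-3 mol/L = 1 mmol/L,
    # so the value written to the CSV should be 0.
    neg_log_mol  = 3.0
    neg_log_mmol = -Math.log10(10**-neg_log_mol * 1000)   # => 0.0
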
diff --git a/lib/model.rb b/lib/model.rb
index caf8a6e..08ca07e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -32,7 +32,7 @@ module OpenTox
# @param [OpenTox::Feature, nil] prediction_feature
# By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
# @param [Hash, nil] algorithms
- # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+ # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
#
# @return [OpenTox::Model::Lazar]
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
@@ -80,7 +80,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.5,
+ :min => [0.5,0.2],
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
@@ -88,7 +88,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.5,
+ :min => [0.5,0.2],
}
end
@@ -100,7 +100,7 @@ module OpenTox
},
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5,
+ :min => [0.5,0.2],
},
:prediction => {
:method => "Algorithm::Caret.rf",
@@ -197,7 +197,7 @@ module OpenTox
# Predict a substance (compound or nanoparticle)
# @param [OpenTox::Substance]
# @return [Hash]
- def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -228,7 +228,7 @@ module OpenTox
end
prediction ||= {:warnings => [], :measurements => []}
- prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
+ prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
neighbor_ids = []
neighbor_similarities = []
neighbor_dependent_variables = []
@@ -238,7 +238,7 @@ module OpenTox
substance_ids.each_with_index do |s,i|
# handle query substance
if substance.id.to_s == s
- prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass
+ prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
if fingerprints?
@@ -264,11 +264,19 @@ module OpenTox
if neighbor_similarities.empty?
prediction[:value] = nil
- prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
+ prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
+ return prediction
+ end
elsif neighbor_similarities.size == 1
prediction[:value] = nil
- prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
+ return prediction
+ end
else
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
# call prediction algorithm
@@ -276,11 +284,17 @@ module OpenTox
prediction.merge! result
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
end
- if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
- prediction
- else # try again with a lower threshold
- prediction[:warnings] << "Lowering similarity threshold to 0.2."
- predict_substance substance, 0.2, prediction
+ if threshold == algorithms[:similarity][:min].first
+ if prediction[:warnings].empty?
+ prediction[:confidence] = "High (close to bioassay results)"
+ return prediction
+ else # try again with a lower threshold
+ prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
+ predict_substance substance, algorithms[:similarity][:min].last, prediction
+ end
+ elsif threshold < algorithms[:similarity][:min].first
+ prediction[:confidence] = "Low (lower than bioassay results)"
+ return prediction
end
end
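
The documentation hunk at the top of model.rb describes the algorithms hash accepted by create; with this change the :similarity entry carries a pair of thresholds instead of a single value. A hypothetical call sketching the new shape (training_dataset stands for an existing OpenTox::Dataset; the values shown are the defaults set in the hunks above):

    model = OpenTox::Model::Lazar.create(
      training_dataset: training_dataset,
      algorithms: {
        :similarity => {
          :method => "Algorithm::Similarity.tanimoto",
          :min => [0.5, 0.2],   # [high-confidence threshold, low-confidence fallback]
        }
      }
    )
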
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index f3e3af8..8a8970e 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -7,79 +7,55 @@ module OpenTox
# @return [Hash]
def statistics
self.accept_values = model.prediction_feature.accept_values
- self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
- self.weighted_confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
- self.nr_predictions = {:all => 0,:without_warnings => 0}
+ self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+ self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
- # TODO
- # use predictions without probabilities (single neighbor)??
- # use measured majority class??
+ # TODO: use measured majority class or all measurements??
if pred[:measurements].uniq.size == 1 and pred[:probabilities]
m = pred[:measurements].first
if pred[:value] == m
- if pred[:value] == accept_values[0]
- confusion_matrix[:all][0][0] += 1
- weighted_confusion_matrix[:all][0][0] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:all] += 1
- if pred[:warnings].empty?
- confusion_matrix[:without_warnings][0][0] += 1
- weighted_confusion_matrix[:without_warnings][0][0] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:without_warnings] += 1
- end
- elsif pred[:value] == accept_values[1]
- confusion_matrix[:all][1][1] += 1
- weighted_confusion_matrix[:all][1][1] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:all] += 1
- if pred[:warnings].empty?
- confusion_matrix[:without_warnings][1][1] += 1
- weighted_confusion_matrix[:without_warnings][1][1] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:without_warnings] += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][i] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/High/i)
+ confusion_matrix[:confidence_high][i][i] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][i] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
end
end
elsif pred[:value] != m
- if pred[:value] == accept_values[0]
- confusion_matrix[:all][0][1] += 1
- weighted_confusion_matrix[:all][0][1] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:all] += 1
- if pred[:warnings].empty?
- confusion_matrix[:without_warnings][0][1] += 1
- weighted_confusion_matrix[:without_warnings][0][1] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:without_warnings] += 1
- end
- elsif pred[:value] == accept_values[1]
- confusion_matrix[:all][1][0] += 1
- weighted_confusion_matrix[:all][1][0] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:all] += 1
- if pred[:warnings].empty?
- confusion_matrix[:without_warnings][1][0] += 1
- weighted_confusion_matrix[:without_warnings][1][0] += pred[:probabilities][pred[:value]]
- self.nr_predictions[:without_warnings] += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][(i+1)%2] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/High/i)
+ confusion_matrix[:confidence_high][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
end
end
end
end
end
- self.true_rate = {:all => {}, :without_warnings => {}}
- self.predictivity = {:all => {}, :without_warnings => {}}
+
+ self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}}
+ self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}}
accept_values.each_with_index do |v,i|
- [:all,:without_warnings].each do |a|
+ [:all,:confidence_high,:confidence_low].each do |a|
self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f
self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f
end
end
- confidence_sum = {:all => 0, :without_warnings => 0}
- [:all,:without_warnings].each do |a|
- weighted_confusion_matrix[a].each do |r|
- r.each do |c|
- confidence_sum[a] += c
- end
- end
- end
self.accuracy = {}
- self.weighted_accuracy = {}
- [:all,:without_warnings].each do |a|
+ [:all,:confidence_high,:confidence_low].each do |a|
self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f
- self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f
end
$logger.debug "Accuracy #{accuracy}"
$logger.debug "Nr Predictions #{nr_predictions}"
@@ -87,9 +63,7 @@ module OpenTox
{
:accept_values => accept_values,
:confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
:accuracy => accuracy,
- :weighted_accuracy => weighted_accuracy,
:true_rate => self.true_rate,
:predictivity => self.predictivity,
:nr_predictions => nr_predictions,
@@ -138,47 +112,27 @@ module OpenTox
# Statistical evaluation of regression validations
module RegressionStatistics
+ attr_accessor :x, :y
+
# Get statistics
# @return [Hash]
def statistics
self.warnings = []
- self.rmse = {:all =>0,:without_warnings => 0}
- self.r_squared = {:all =>0,:without_warnings => 0}
- self.mae = {:all =>0,:without_warnings => 0}
- self.within_prediction_interval = {:all =>0,:without_warnings => 0}
- self.out_of_prediction_interval = {:all =>0,:without_warnings => 0}
- x = {:all => [],:without_warnings => []}
- y = {:all => [],:without_warnings => []}
- self.nr_predictions = {:all =>0,:without_warnings => 0}
+ self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ @x = {:all => [],:confidence_high => [],:confidence_low => []}
+ @y = {:all => [],:confidence_high => [],:confidence_low => []}
+ self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
if pred[:value] and pred[:measurements] and !pred[:measurements].empty?
- self.nr_predictions[:all] +=1
- x[:all] << pred[:measurements].median
- y[:all] << pred[:value]
- error = pred[:value]-pred[:measurements].median
- self.rmse[:all] += error**2
- self.mae[:all] += error.abs
- if pred[:prediction_interval]
- if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
- self.within_prediction_interval[:all] += 1
- else
- self.out_of_prediction_interval[:all] += 1
- end
- end
- if pred[:warnings].empty?
- self.nr_predictions[:without_warnings] +=1
- x[:without_warnings] << pred[:measurements].median
- y[:without_warnings] << pred[:value]
- error = pred[:value]-pred[:measurements].median
- self.rmse[:without_warnings] += error**2
- self.mae[:without_warnings] += error.abs
- if pred[:prediction_interval]
- if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
- self.within_prediction_interval[:without_warnings] += 1
- else
- self.out_of_prediction_interval[:without_warnings] += 1
- end
- end
+ insert_prediction pred, :all
+ if pred[:confidence].match(/High/i)
+ insert_prediction pred, :confidence_high
+ elsif pred[:confidence].match(/Low/i)
+ insert_prediction pred, :confidence_low
end
else
trd_id = model.training_dataset_id
@@ -187,10 +141,10 @@ module OpenTox
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
end
end
- [:all,:without_warnings].each do |a|
- if x[a].size > 2
- R.assign "measurement", x[a]
- R.assign "prediction", y[a]
+ [:all,:confidence_high,:confidence_low].each do |a|
+ if @x[a].size > 2
+ R.assign "measurement", @x[a]
+ R.assign "prediction", @y[a]
R.eval "r <- cor(measurement,prediction,use='pairwise')"
self.r_squared[a] = R.eval("r").to_ruby**2
else
@@ -209,7 +163,6 @@ module OpenTox
$logger.debug "MAE #{mae}"
$logger.debug "Nr predictions #{nr_predictions}"
$logger.debug "#{within_prediction_interval} measurements within prediction interval"
- $logger.debug "#{warnings}"
save
{
:mae => mae,
@@ -270,6 +223,24 @@ module OpenTox
end
worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
end
+
+ private
+
+ def insert_prediction prediction, type
+ self.nr_predictions[type] +=1
+ @x[type] << prediction[:measurements].median
+ @y[type] << prediction[:value]
+ error = prediction[:value]-prediction[:measurements].median
+ self.rmse[type] += error**2
+ self.mae[type] += error.abs
+ if prediction[:prediction_interval]
+ if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1]
+ self.within_prediction_interval[type] += 1
+ else
+ self.out_of_prediction_interval[type] += 1
+ end
+ end
+ end
end
end
end
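
For reference, the per-bucket classification figures above (accuracy, true_rate, predictivity) are all derived from the same confusion matrix layout: rows are indexed by the predicted class, columns by the measured class. A toy 2x2 example with made-up counts and hypothetical labels, reproducing the formulas used in statistics:

    accept_values = ["mutagenic", "non-mutagenic"]   # hypothetical labels
    cm = [[40, 10],    # predicted accept_values[0]: 40 correct, 10 wrong
          [ 5, 45]]    # predicted accept_values[1]:  5 wrong,  45 correct
    nr_predictions = cm.flatten.reduce(:+)

    accuracy     = (cm[0][0] + cm[1][1]) / nr_predictions.to_f   # => 0.85
    true_rate    = accept_values.each_with_index.map { |v,i| [v, cm[i][i] / cm[i].reduce(:+).to_f] }.to_h
    predictivity = accept_values.each_with_index.map { |v,i| [v, cm[i][i] / cm.collect{|n| n[i]}.reduce(:+).to_f] }.to_h
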