summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author helma@in-silico.ch <helma@in-silico.ch> 2018-09-06 17:24:25 +0200
committer helma@in-silico.ch <helma@in-silico.ch> 2018-09-06 17:24:25 +0200
commit 6d6be53a110e71d0d56ae5ea9a2675f76f7c84ec (patch)
tree 14b31f061bec2fbb829fe84f675951ea8cda31b2
parent 13e7865f386603fb784e62feef2ee2a56c015b45 (diff)
adjusted classification similarities, dataset sdf export
-rw-r--r-- lib/compound.rb 10
-rw-r--r-- lib/crossvalidation.rb 22
-rw-r--r-- lib/dataset.rb 13
-rw-r--r-- lib/opentox.rb 2
-rw-r--r-- lib/validation-statistics.rb 139
5 files changed, 126 insertions, 60 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index bfe69e3..e8f6bc4 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -131,12 +131,12 @@ module OpenTox
# @return [OpenTox::Compound]
def self.from_smiles smiles
if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
- $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
+ warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
return nil
end
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
if smiles.empty?
- $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
+ warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
return nil
else
Compound.find_or_create_by :smiles => smiles
@@ -259,7 +259,7 @@ module OpenTox
self["chemblid"]
end
- def db_neighbors min_sim: 0.1, dataset_id:
+ def db_neighbors min_sim: 0.2, dataset_id:
#p fingerprints[DEFAULT_FINGERPRINT]
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
@@ -332,11 +332,11 @@ module OpenTox
print sdf
if sdf.match(/.nan/)
- $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
+ warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
if sdf.match(/.nan/)
- $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
+ warn "2D generation failed for compound #{identifier}, rendering without coordinates."
obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index e94864a..d1347a5 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -72,25 +72,27 @@ module OpenTox
class ClassificationCrossValidation < CrossValidation
include ClassificationStatistics
# Fields holding the classification statistics of a crossvalidation.
# With this change the statistics fields are declared as Hash instead of
# Array/Float. The classification statistics method in
# lib/validation-statistics.rb (presumably the one mixed in above - confirm)
# fills every Hash field with the two keys :all and :without_warnings.
# accept_values stays a plain Array.
field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
+ field :confusion_matrix, type: Hash
+ field :weighted_confusion_matrix, type: Hash
+ field :accuracy, type: Hash
+ field :weighted_accuracy, type: Hash
field :true_rate, type: Hash
field :predictivity, type: Hash
# counters of evaluated predictions, one per key (:all, :without_warnings)
+ field :nr_predictions, type: Hash
field :probability_plot_id, type: BSON::ObjectId
end
# Crossvalidation of regression models
class RegressionCrossValidation < CrossValidation
include RegressionStatistics
# Fields holding the regression statistics of a crossvalidation.
# With this change the numeric fields (Float/Integer, mostly with default 0)
# become Hash fields without defaults. The regression statistics method in
# lib/validation-statistics.rb (presumably the one mixed in above - confirm)
# assigns {:all => ..., :without_warnings => ...} to rmse, mae, both
# prediction interval counters and nr_predictions.
# NOTE(review): r_squared has no default, and the visible part of that
# statistics method writes r_squared[a] without first assigning a Hash to
# r_squared - confirm that it is initialised somewhere before use.
- field :rmse, type: Float, default:0
- field :mae, type: Float, default:0
- field :r_squared, type: Float
- field :within_prediction_interval, type: Integer, default:0
- field :out_of_prediction_interval, type: Integer, default:0
- field :correlation_plot_id, type: BSON::ObjectId
+ field :rmse, type: Hash
+ field :mae, type: Hash
+ field :r_squared, type: Hash
+ field :within_prediction_interval, type: Hash
+ field :out_of_prediction_interval, type: Hash
+ field :nr_predictions, type: Hash
field :warnings, type: Array
+ field :correlation_plot_id, type: BSON::ObjectId
end
# Independent repeated crossvalidations
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 6e7d67f..b32d526 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -135,6 +135,19 @@ module OpenTox
end
end
+ # Convert dataset to SDF file
+ # @return [String]
+ def to_sdf
+ substances.each do |substance|
+ puts substance.sdf.sub(/\$\$\$\$\n/,"")
+ features.each do |f|
+ puts "> <#{f.name}>"
+ puts values(substance,f).uniq.join ","
+ puts "\n$$$$"
+ end
+ end
+ end
+
# Parsers
# Create a dataset from file (csv,sdf,...)
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 5c300cf..03d65b0 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -15,7 +15,7 @@ module OpenTox
field :warnings, type: Array, default: []
# Record a warning by appending it to the warnings array of this object.
# With this change the warning is no longer passed to $logger (the call is
# commented out below).
# @param warning the entry to append to warnings
# @return the result of `warnings << warning`
def warn warning
- $logger.warn warning
+ #$logger.warn warning
warnings << warning
end
end
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 69e7992..a69ede3 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -7,9 +7,10 @@ module OpenTox
# @return [Hash]
def statistics
self.accept_values = model.prediction_feature.accept_values
- self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- nr_instances = 0
+ self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+ self.weighted_confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :without_warnings => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+ #self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ self.nr_predictions = {:all => 0,:without_warnings => 0}
predictions.each do |cid,pred|
# TODO
# use predictions without probabilities (single neighbor)??
@@ -18,41 +19,69 @@ module OpenTox
m = pred[:measurements].first
if pred[:value] == m
if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ confusion_matrix[:all][0][0] += 1
+ weighted_confusion_matrix[:all][0][0] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:all] += 1
+ if pred[:warnings].empty?
+ confusion_matrix[:without_warnings][0][0] += 1
+ weighted_confusion_matrix[:without_warnings][0][0] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:without_warnings] += 1
+ end
elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ confusion_matrix[:all][1][1] += 1
+ weighted_confusion_matrix[:all][1][1] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:all] += 1
+ if pred[:warnings].empty?
+ confusion_matrix[:without_warnings][1][1] += 1
+ weighted_confusion_matrix[:without_warnings][1][1] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:without_warnings] += 1
+ end
end
elsif pred[:value] != m
if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ confusion_matrix[:all][0][1] += 1
+ weighted_confusion_matrix[:all][0][1] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:all] += 1
+ if pred[:warnings].empty?
+ confusion_matrix[:without_warnings][0][1] += 1
+ weighted_confusion_matrix[:without_warnings][0][1] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:without_warnings] += 1
+ end
elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ confusion_matrix[:all][1][0] += 1
+ weighted_confusion_matrix[:all][1][0] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:all] += 1
+ if pred[:warnings].empty?
+ confusion_matrix[:without_warnings][1][0] += 1
+ weighted_confusion_matrix[:without_warnings][1][0] += pred[:probabilities][pred[:value]]
+ self.nr_predictions[:without_warnings] += 1
+ end
end
end
end
end
- self.true_rate = {}
- self.predictivity = {}
+ self.true_rate = {:all => {}, :without_warnings => {}}
+ self.predictivity = {:all => {}, :without_warnings => {}}
accept_values.each_with_index do |v,i|
- self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ [:all,:without_warnings].each do |a|
+ self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f
+ self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f
+ end
end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+ confidence_sum = {:all => 0, :without_warnings => 0}
+ [:all,:without_warnings].each do |a|
+ weighted_confusion_matrix[a].each do |r|
+ r.each do |c|
+ confidence_sum[a] += c
+ end
end
end
- self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
- self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ self.accuracy = {}
+ self.weighted_accuracy = {}
+ [:all,:without_warnings].each do |a|
+ self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f
+ self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f
+ end
$logger.debug "Accuracy #{accuracy}"
save
{
@@ -63,6 +92,7 @@ module OpenTox
:weighted_accuracy => weighted_accuracy,
:true_rate => self.true_rate,
:predictivity => self.predictivity,
+ :nr_predictions => nr_predictions,
}
end
@@ -112,26 +142,44 @@ module OpenTox
# @return [Hash]
def statistics
self.warnings = []
- self.rmse = 0
- self.mae = 0
- self.within_prediction_interval = 0
- self.out_of_prediction_interval = 0
- x = []
- y = []
+ self.rmse = {:all =>0,:without_warnings => 0}
+ self.mae = {:all =>0,:without_warnings => 0}
+ self.within_prediction_interval = {:all =>0,:without_warnings => 0}
+ self.out_of_prediction_interval = {:all =>0,:without_warnings => 0}
+ x = {:all => [],:without_warnings => []}
+ y = {:all => [],:without_warnings => []}
+ self.nr_predictions = {:all =>0,:without_warnings => 0}
+ error = {}
predictions.each do |cid,pred|
if pred[:value] and pred[:measurements]
- x << pred[:measurements].median
- y << pred[:value]
- error = pred[:value]-pred[:measurements].median
- self.rmse += error**2
- self.mae += error.abs
+ self.nr_predictions[:all] +=1
+ x[:all] << pred[:measurements].median
+ y[:all] << pred[:value]
+ error[:all] = pred[:value]-pred[:measurements].median
+ self.rmse[:all] += error**2
+ self.mae[:all] += error.abs
if pred[:prediction_interval]
if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
- self.within_prediction_interval += 1
+ self.within_prediction_interval[:all] += 1
else
- self.out_of_prediction_interval += 1
+ self.out_of_prediction_interval[:all] += 1
end
end
+ if pred[:warnings].empty?
+ self.nr_predictions[:without_warnings] +=1
+ x[:without_warnings] << pred[:measurements].median
+ y[:without_warnings] << pred[:value]
+ error[:without_warnings] = pred[:value]-pred[:measurements].median
+ self.rmse[:without_warnings] += error**2
+ self.mae[:without_warnings] += error.abs
+ if pred[:prediction_interval]
+ if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
+ self.within_prediction_interval[:without_warnings] += 1
+ else
+ self.out_of_prediction_interval[:without_warnings] += 1
+ end
+ end
+ end
else
trd_id = model.training_dataset_id
smiles = Compound.find(cid).smiles
@@ -139,12 +187,14 @@ module OpenTox
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
end
end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='pairwise')"
- self.r_squared = R.eval("r").to_ruby**2
- self.mae = self.mae/predictions.size
- self.rmse = Math.sqrt(self.rmse/predictions.size)
+ [:all,:without_warnings].each do |a|
+ R.assign "measurement", x[a]
+ R.assign "prediction", y[a]
+ R.eval "r <- cor(measurement,prediction,use='pairwise')"
+ self.r_squared[a] = R.eval("r").to_ruby**2
+ self.mae[a] = self.mae[a]/self.nr_predictions[a]
+ self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a])
+ end
$logger.debug "R^2 #{r_squared}"
$logger.debug "RMSE #{rmse}"
$logger.debug "MAE #{mae}"
@@ -157,6 +207,7 @@ module OpenTox
:r_squared => r_squared,
:within_prediction_interval => within_prediction_interval,
:out_of_prediction_interval => out_of_prediction_interval,
+ :nr_predictions => nr_predictions,
}
end