summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/crossvalidation.rb57
-rw-r--r--lib/dataset.rb1
-rw-r--r--lib/error.rb2
-rw-r--r--lib/experiment.rb81
-rw-r--r--lib/lazar.rb5
-rw-r--r--lib/model.rb3
-rw-r--r--test/experiment.rb62
-rw-r--r--test/validation.rb12
8 files changed, 141 insertions, 82 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 90c0d75..f480932 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -22,7 +22,9 @@ module OpenTox
end
def self.create model, n=10
- cv = self.new(
+ model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
+ bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+ cv = klass.new(
name: model.name,
model_id: model.id,
folds: n
@@ -55,6 +57,7 @@ module OpenTox
nr_unpredicted: nr_unpredicted,
predictions: predictions
)
+ cv.statistics
cv
end
end
@@ -70,14 +73,13 @@ module OpenTox
field :predictivity, type: Hash
# TODO auc, f-measure (usability??)
- def self.create model, n=10
- cv = super model, n
+ def statistics
accept_values = Feature.find(model.prediction_feature_id).accept_values
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
true_rate = {}
predictivity = {}
- cv.predictions.each do |pred|
+ predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction and confidence.numeric?
if prediction == activity
@@ -113,18 +115,16 @@ module OpenTox
confidence_sum += c
end
end
- cv.update_attributes(
+ update_attributes(
accept_values: accept_values,
confusion_matrix: confusion_matrix,
weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f,
+ accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
true_rate: true_rate,
predictivity: predictivity,
finished_at: Time.now
)
- cv.save
- cv
end
#Average area under roc 0.646
@@ -142,8 +142,7 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
- def self.create model, n=10
- cv = super model, n
+ def statistics
rmse = 0
weighted_rmse = 0
rse = 0
@@ -153,7 +152,7 @@ module OpenTox
rae = 0
weighted_rae = 0
confidence_sum = 0
- cv.predictions.each do |pred|
+ predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
error = Math.log10(prediction)-Math.log10(activity)
@@ -163,24 +162,24 @@ module OpenTox
weighted_mae += confidence*error.abs
confidence_sum += confidence
else
- cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
end
end
- x = cv.predictions.collect{|p| p[1]}
- y = cv.predictions.collect{|p| p[2]}
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
R.eval "r <- cor(-log(measurement),-log(prediction))"
r = R.eval("r").to_ruby
- mae = mae/cv.predictions.size
+ mae = mae/predictions.size
weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/cv.predictions.size)
+ rmse = Math.sqrt(rmse/predictions.size)
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
# TODO check!!
=begin
- cv.predictions.sort! do |a,b|
+ predictions.sort! do |a,b|
relative_error_a = (a[1]-a[2]).abs/a[1].to_f
relative_error_a = 1/relative_error_a if relative_error_a < 1
relative_error_b = (b[1]-b[2]).abs/b[1].to_f
@@ -188,15 +187,14 @@ module OpenTox
[relative_error_b,b[3]] <=> [relative_error_a,a[3]]
end
=end
- cv.update_attributes(
+ update_attributes(
mae: mae,
rmse: rmse,
weighted_mae: weighted_mae,
weighted_rmse: weighted_rmse,
- r_squared: r**2
+ r_squared: r**2,
+ finished_at: Time.now
)
- cv.save
- cv
end
def misclassifications n=nil
@@ -277,5 +275,20 @@ module OpenTox
end
end
+ class RepeatedCrossValidation
+ field :crossvalidation_ids, type: Array, default: []
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ end
+ repeated_cross_validation.save
+ repeated_cross_validation
+ end
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
+ end
+ end
+
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 851fabd..d884716 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -47,6 +47,7 @@ module OpenTox
@data_entries = Marshal.load(data_entry_file.data)
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
+ # TODO: data_entries is sometimes empty at this point (hard to reproduce, possibly a MongoDB problem)
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
#$logger.debug "Retrieving data: #{Time.now-t}"
end
diff --git a/lib/error.rb b/lib/error.rb
index 8fe8a1e..39b3c76 100644
--- a/lib/error.rb
+++ b/lib/error.rb
@@ -58,7 +58,7 @@ module OpenTox
OpenTox.const_set error[:class],c
# define global methods for raising errors, eg. bad_request_error
- Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
+ Object.send(:define_method, error[:method]) do |message|
raise c.new(message)
end
end
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 2f51756..7849337 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -2,45 +2,22 @@ module OpenTox
class Experiment
field :dataset_ids, type: Array
- field :model_algorithms, type: Array
- field :model_ids, type: Array, default: []
- field :crossvalidation_ids, type: Array, default: []
- field :prediction_algorithms, type: Array
- field :neighbor_algorithms, type: Array
- field :neighbor_algorithm_parameters, type: Array
+ field :model_settings, type: Array
+ field :results, type: Hash, default: {}
end
- # TODO more sophisticated experimental design
def run
dataset_ids.each do |dataset_id|
dataset = Dataset.find(dataset_id)
- model_algorithms.each do |model_algorithm|
- prediction_algorithms.each do |prediction_algorithm|
- neighbor_algorithms.each do |neighbor_algorithm|
- neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter|
- $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
- model = Object.const_get(model_algorithm).create dataset
- model.prediction_algorithm = prediction_algorithm
- model.neighbor_algorithm = neighbor_algorithm
- model.neighbor_algorithm_parameters = neighbor_algorithm_parameter
- model.save
- model_ids << model.id
- cv = nil
- if dataset.features.first.nominal
- cv = ClassificationCrossValidation
- elsif dataset.features.first.numeric
- cv = RegressionCrossValidation
- end
- if cv
- $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
- crossvalidation = cv.create model
- self.crossvalidation_ids << crossvalidation.id
- else
- $logger.warn "#{dataset.features.first} is neither nominal nor numeric."
- end
- end
- end
- end
+ results[dataset_id.to_s] = []
+ model_settings.each do |setting|
+ model = Object.const_get(setting[:algorithm]).create dataset
+ model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
+ model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
+ model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
+ model.save
+ repeated_crossvalidation = RepeatedCrossValidation.create model
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
end
end
save
@@ -54,13 +31,37 @@ module OpenTox
end
def report
- # TODO create ggplot2 report
- self.crossvalidation_ids.each do |id|
- cv = CrossValidation.find(id)
- file = "/tmp/#{id}.svg"
- File.open(file,"w+"){|f| f.puts cv.correlation_plot}
- `inkview '#{file}'`
+ # TODO significances
+ report = {}
+ report[:name] = name
+ report[:experiment_id] = self.id.to_s
+ dataset_ids.each do |dataset_id|
+ dataset_name = Dataset.find(dataset_id).name
+ report[dataset_name] = []
+ results[dataset_id.to_s].each do |result|
+ model = Model::Lazar.find(result[:model_id])
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+ crossvalidations = repeated_cv.crossvalidations
+ summary = {}
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+ summary[key] = model[key]
+ end
+ summary[:nr_instances] = crossvalidations.first.nr_instances
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
+ if crossvalidations.first.is_a? ClassificationCrossValidation
+ summary[:accuracies] = crossvalidations.collect{|cv| cv.accuracy}
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
+ summary[:r_squared] = crossvalidations.collect{|cv| cv.r_squared}
+ end
+ report[dataset_name] << summary
+ #p repeated_cv.crossvalidations.collect{|cv| cv.accuracy}
+ #file = "/tmp/#{id}.svg"
+ #File.open(file,"w+"){|f| f.puts cv.correlation_plot}
+ #`inkview '#{file}'`
+ end
end
+ report
end
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index decbe69..9b02053 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -15,7 +15,8 @@ ENV["MONGOID_ENV"] ||= "development"
# TODO remove config files, change default via ENV or directly in Mongoid class
Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongoid.default_client
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
+#$mongo = Mongoid.default_client
$gridfs = $mongo.database.fs
# R setup
@@ -42,7 +43,7 @@ ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'
# OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 0155fc8..ddb69e4 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -28,9 +28,6 @@ module OpenTox
field :neighbor_algorithm, type: String
field :neighbor_algorithm_parameters, type: Hash
- #attr_accessor :prediction_dataset
- #attr_accessor :training_dataset
-
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
diff --git a/test/experiment.rb b/test/experiment.rb
index c465d7b..cad4fa7 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -4,27 +4,61 @@ class ExperimentTest < MiniTest::Test
def test_regression_experiment
datasets = [
- "EPAFHM.csv",
- "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
+ "EPAFHM.medi.csv",
+ #"EPAFHM.csv",
+ #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
"LOAEL_mmol_corrected_smiles.csv"
+ ]
+ experiment = Experiment.create(
+ :name => "Default regression for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ :model_settings => [
+ {
+ :algorithm => "OpenTox::Model::LazarRegression",
+ }
]
- model_algorithms = ["OpenTox::Model::LazarRegression"]
- neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"]
- prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"]
- neighbor_algorithm_parameters = [{:min_sim => 0.7}]
+ )
+ experiment.run
+ puts experiment.report.to_yaml
+ assert_equal datasets.size, experiment.results.size
+ experiment.results.each do |dataset_id, result|
+ assert_equal 1, result.size
+ result.each do |r|
+ assert_kind_of BSON::ObjectId, r[:model_id]
+ assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+ end
+ end
+ end
+
+ def test_classification_experiment
+
+ datasets = [ "hamster_carcinogenicity.csv" ]
experiment = Experiment.create(
- :name => "Regression for datasets #{datasets}.",
+ :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
:dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
- :model_algorithms => model_algorithms,
- :neighbor_algorithms => neighbor_algorithms,
- :neighbor_algorithm_parameters => neighbor_algorithm_parameters,
- :prediction_algorithms => prediction_algorithms,
+ :model_settings => [
+ {
+ :algorithm => "OpenTox::Model::LazarClassification",
+ },{
+ :algorithm => "OpenTox::Model::LazarClassification",
+ :neighbor_algorithm_parameter => {:min_sim => 0.3}
+ },
+ #{
+ #:algorithm => "OpenTox::Model::LazarFminerClassification",
+ #}
+ ]
)
experiment.run
=begin
- p experiment
- experiment.report
+ experiment = Experiment.find "55f944a22b72ed7de2000000"
=end
- refute_empty experiment.crossvalidation_ids
+ puts experiment.report.to_yaml
+ experiment.results.each do |dataset_id, result|
+ assert_equal 2, result.size
+ result.each do |r|
+ assert_kind_of BSON::ObjectId, r[:model_id]
+ assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+ end
+ end
end
end
diff --git a/test/validation.rb b/test/validation.rb
index a4c3d80..dfa2c81 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -33,4 +33,16 @@ class ValidationTest < MiniTest::Test
#assert cv.weighted_mae < cv.mae
end
+ def test_repeated_crossvalidation
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ model = Model::LazarClassification.create dataset
+ repeated_cv = RepeatedCrossValidation.create model
+ p repeated_cv
+ repeated_cv.crossvalidations.each do |cv|
+ p cv
+ p cv.accuracy
+ assert cv.accuracy > 0.7
+ end
+ end
+
end