summaryrefslogtreecommitdiff
path: root/lib/validation.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/validation.rb')
-rw-r--r--lib/validation.rb136
1 files changed, 43 insertions, 93 deletions
diff --git a/lib/validation.rb b/lib/validation.rb
index c2250de..bcbe49a 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,36 +1,41 @@
module OpenTox
class Validation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "validations"
field :prediction_dataset_id, type: BSON::ObjectId
field :test_dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
+ field :predictions, type: Array
+
+ def prediction_dataset
+ Dataset.find prediction_dataset_id
+ end
+
+ def test_dataset
+ Dataset.find test_dataset_id
+ end
+
+ end
+
+ class ClassificationValidation < Validation
field :accept_values, type: String
field :confusion_matrix, type: Array
field :weighted_confusion_matrix, type: Array
- field :predictions, type: Array
- # TODO classification und regression in subclasses
def self.create model, training_set, test_set
validation = self.class.new
- feature_dataset = Dataset.find model.feature_dataset_id
- if feature_dataset.is_a? FminerDataset
- features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
- else
- # TODO search for descriptors
- end
- validation_model = Model::Lazar.create training_set, features
+ #feature_dataset = Dataset.find model.feature_dataset_id
+ # TODO check and delegate to Algorithm
+ #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
+ validation_model = model.class.create training_set#, features
test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
prediction_dataset = validation_model.predict test_set_without_activities
accept_values = prediction_dataset.prediction_feature.accept_values
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
predictions = []
+ nr_unpredicted = 0
prediction_dataset.data_entries.each_with_index do |pe,i|
if pe[0] and pe[1] and pe[1].numeric?
prediction = pe[0]
@@ -56,13 +61,15 @@ module OpenTox
weighted_confusion_matrix[1][0] += confidence
end
end
+ else
+ nr_unpredicted += 1 if pe[0].nil?
end
end
validation = self.new(
:prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
:nr_instances => test_set.compound_ids.size,
- :nr_unpredicted => prediction_dataset.data_entries.count{|de| de.first.nil?},
+ :nr_unpredicted => nr_unpredicted,
:accept_values => accept_values,
:confusion_matrix => confusion_matrix,
:weighted_confusion_matrix => weighted_confusion_matrix,
@@ -71,94 +78,37 @@ module OpenTox
validation.save
validation
end
-
- def prediction_dataset
- Dataset.find prediction_dataset_id
- end
-
- def test_dataset
- Dataset.find test_dataset_id
- end
-
end
- class CrossValidation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "crossvalidations"
-
- field :validation_ids, type: Array, default: []
- field :folds, type: Integer
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :predictions, type: Array
- # TODO auc, f-measure (usability??)
-
- def self.create model, n=10
- validation_ids = []
- nr_instances = 0
- nr_unpredicted = 0
- accept_values = model.prediction_feature.accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
+ class RegressionValidation < Validation
+ def self.create model, training_set, test_set
+
+ validation_model = Model::LazarRegression.create training_set
+ test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+ prediction_dataset = validation_model.predict test_set_without_activities
predictions = []
- model.training_dataset.folds(n).each do |fold|
- validation = Validation.create(model, fold[0], fold[1])
- validation_ids << validation.id
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- validation.confusion_matrix.each_with_index do |r,i|
- r.each_with_index do |c,j|
- confusion_matrix[i][j] += c
- weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
- end
- end
- predictions << validation.predictions
- end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+ nr_unpredicted = 0
+ activities = test_set.data_entries.collect{|de| de.first}
+ prediction_dataset.data_entries.each_with_index do |de,i|
+ if de[0] and de[1] and de[1].numeric?
+ activity = activities[i]
+ prediction = de.first
+ confidence = de[1]
+ predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+ else
+ nr_unpredicted += 1
end
end
- cv = CrossValidation.new(
- :folds => n,
- :validation_ids => validation_ids,
- :nr_instances => nr_instances,
+ validation = self.new(
+ :prediction_dataset_id => prediction_dataset.id,
+ :test_dataset_id => test_set.id,
+ :nr_instances => test_set.compound_ids.size,
:nr_unpredicted => nr_unpredicted,
- :accept_values => accept_values,
- :confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
- :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- :true_rate => true_rate,
- :predictivity => predictivity,
:predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
)
- cv.save
- cv
+ validation.save
+ validation
end
-
- #Average area under roc 0.646
- #Area under roc 0.646
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
-
end
end