diff options
Diffstat (limited to 'lib/validation.rb')
-rw-r--r-- | lib/validation.rb | 136 |
1 files changed, 43 insertions, 93 deletions
diff --git a/lib/validation.rb b/lib/validation.rb index c2250de..bcbe49a 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -1,36 +1,41 @@ module OpenTox class Validation - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "validations" field :prediction_dataset_id, type: BSON::ObjectId field :test_dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer + field :predictions, type: Array + + def prediction_dataset + Dataset.find prediction_dataset_id + end + + def test_dataset + Dataset.find test_dataset_id + end + + end + + class ClassificationValidation < Validation field :accept_values, type: String field :confusion_matrix, type: Array field :weighted_confusion_matrix, type: Array - field :predictions, type: Array - # TODO classification und regression in subclasses def self.create model, training_set, test_set validation = self.class.new - feature_dataset = Dataset.find model.feature_dataset_id - if feature_dataset.is_a? FminerDataset - features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters - else - # TODO search for descriptors - end - validation_model = Model::Lazar.create training_set, features + #feature_dataset = Dataset.find model.feature_dataset_id + # TODO check and delegate to Algorithm + #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters + validation_model = model.class.create training_set#, features test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities accept_values = prediction_dataset.prediction_feature.accept_values confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} predictions = [] + nr_unpredicted = 0 prediction_dataset.data_entries.each_with_index do |pe,i| if pe[0] and pe[1] and pe[1].numeric? prediction = pe[0] @@ -56,13 +61,15 @@ module OpenTox weighted_confusion_matrix[1][0] += confidence end end + else + nr_unpredicted += 1 if pe[0].nil? end end validation = self.new( :prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, :nr_instances => test_set.compound_ids.size, - :nr_unpredicted => prediction_dataset.data_entries.count{|de| de.first.nil?}, + :nr_unpredicted => nr_unpredicted, :accept_values => accept_values, :confusion_matrix => confusion_matrix, :weighted_confusion_matrix => weighted_confusion_matrix, @@ -71,94 +78,37 @@ module OpenTox validation.save validation end - - def prediction_dataset - Dataset.find prediction_dataset_id - end - - def test_dataset - Dataset.find test_dataset_id - end - end - class CrossValidation - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "crossvalidations" - - field :validation_ids, type: Array, default: [] - field :folds, type: Integer - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :predictions, type: Array - # TODO auc, f-measure (usability??) - - def self.create model, n=10 - validation_ids = [] - nr_instances = 0 - nr_unpredicted = 0 - accept_values = model.prediction_feature.accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} + class RegressionValidation < Validation + def self.create model, training_set, test_set + + validation_model = Model::LazarRegression.create training_set + test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used + prediction_dataset = validation_model.predict test_set_without_activities predictions = [] - model.training_dataset.folds(n).each do |fold| - validation = Validation.create(model, fold[0], fold[1]) - validation_ids << validation.id - nr_instances += validation.nr_instances - nr_unpredicted += validation.nr_unpredicted - validation.confusion_matrix.each_with_index do |r,i| - r.each_with_index do |c,j| - confusion_matrix[i][j] += c - weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j] - end - end - predictions << validation.predictions - end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + nr_unpredicted = 0 + activities = test_set.data_entries.collect{|de| de.first} + prediction_dataset.data_entries.each_with_index do |de,i| + if de[0] and de[1] and de[1].numeric? + activity = activities[i] + prediction = de.first + confidence = de[1] + predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence] + else + nr_unpredicted += 1 end end - cv = CrossValidation.new( - :folds => n, - :validation_ids => validation_ids, - :nr_instances => nr_instances, + validation = self.new( + :prediction_dataset_id => prediction_dataset.id, + :test_dataset_id => test_set.id, + :nr_instances => test_set.compound_ids.size, :nr_unpredicted => nr_unpredicted, - :accept_values => accept_values, - :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, - :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - :true_rate => true_rate, - :predictivity => predictivity, :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) - cv.save - cv + validation.save + validation end - - #Average area under roc 0.646 - #Area under roc 0.646 - #F measure carcinogen: 0.769, noncarcinogen: 0.348 - end end |