From 65c7bdd2bc5de1c2f7bf44a4ed93cb80cc7b4b17 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Sun, 9 Aug 2015 13:40:52 +0200
Subject: customized prediction algorithms implemented

---
 lib/algorithm.rb         |  12 +++-
 lib/classification.rb    |  28 +++++++-
 lib/lazar.rb             | 166 ++++++++++++++++++++++++++++++++---------------
 lib/opentox-algorithm.rb |   3 +-
 lib/regression.rb        |  33 ++++++++--
 lib/similarity.rb        |   2 +-
 lib/validation.rb        | 136 ++++++++++++--------------------------
 7 files changed, 224 insertions(+), 156 deletions(-)

diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 0e227d6..113f847 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -2,10 +2,18 @@ module OpenTox
 
   module Algorithm
 
-    def self.run algorithm, object, parameters={}
+    # Generic method to execute algorithms
+    # Algorithms should:
+    #   - accept a Compound, an Array of Compounds or a Dataset as first argument
+    #   - optional parameters as second argument
+    #   - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
+    # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
+    # @param [Hash] Algorithm parameters
+    # @return Algorithm result
+    def self.run algorithm, object, parameters=nil
       bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
       klass,method = algorithm.split('.')
-      parameters.empty? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+      parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
    end
 
   end
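Editor's note: the string-based dispatch in Algorithm.run relies only on Ruby constant and method lookup, so it can be exercised outside the OpenTox stack. A minimal, self-contained sketch follows; the Toy::Algorithm::Echo class and the run helper are illustrations, not part of this patch, and the rationale for the parameters=nil default (distinguishing "no parameters given" from an explicitly passed Hash) is an assumption.

    # Illustration only: a toy namespace standing in for OpenTox::Algorithm classes.
    module Toy
      module Algorithm
        class Echo
          # convention: first argument is the input object, second the optional parameters
          def self.run_once object, parameters=nil
            parameters.nil? ? object : [object, parameters]
          end
        end
      end
    end

    def run algorithm, object, parameters=nil
      raise ArgumentError, "Cannot run '#{algorithm}'." unless algorithm =~ /^Toy::Algorithm/
      klass,method = algorithm.split('.')   # "Toy::Algorithm::Echo.run_once" -> class name, method name
      parameters.nil? ? Object.const_get(klass).send(method,object) :
                        Object.const_get(klass).send(method,object,parameters)
    end

    p run("Toy::Algorithm::Echo.run_once", "c1ccccc1")                    # => "c1ccccc1"
    p run("Toy::Algorithm::Echo.run_once", "c1ccccc1", :min_sim => 0.7)   # => ["c1ccccc1", {:min_sim=>0.7}]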
diff --git a/lib/classification.rb b/lib/classification.rb
index d71ab77..fc6fa77 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,10 +3,35 @@ module OpenTox
 
     class Classification
 
+      def self.weighted_majority_vote neighbors
+        return [nil,nil] if neighbors.empty?
+        weighted_sum = {}
+        sim_sum = 0.0
+        neighbors.each do |row|
+          n,sim,acts = row
+          acts.each do |act|
+            weighted_sum[act] ||= 0
+            weighted_sum[act] += sim
+          end
+        end
+        case weighted_sum.size
+        when 1
+          return [weighted_sum.keys.first, 1.0]
+        when 2
+          sim_sum = weighted_sum[weighted_sum.keys[0]]
+          sim_sum -= weighted_sum[weighted_sum.keys[1]]
+          sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
+          confidence = (sim_sum/neighbors.size).abs
+          return [prediction,confidence]
+        else
+          bad_request_error "Cannot predict more than 2 classes, multinomial classification is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
+        end
+      end
+
       # Classification with majority vote from neighbors weighted by similarity
       # @param [Hash] params Keys `:activities, :sims, :value_map` are required
       # @return [Numeric] A prediction value.
-      def self.weighted_majority_vote(neighbors)
+      def self.fminer_weighted_majority_vote neighbors, training_dataset
         neighbor_contribution = 0.0
         confidence_sum = 0.0
 
@@ -15,6 +40,7 @@ module OpenTox
         values = neighbors.collect{|n| n[2]}.uniq
 
         neighbors.each do |neighbor|
+          i = training_dataset.compound_ids.index n.id
           neighbor_weight = neighbor[1]
           activity = values.index(neighbor[2]) + 1 # map values to integers > 1
           neighbor_contribution += activity * neighbor_weight
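Editor's note: the new weighted_majority_vote expects neighbor rows shaped like [compound_id, similarity, activities], the format that Lazar#predict assembles further down in this patch. The following standalone sketch is not the patched code itself, only the same arithmetic on invented neighbor rows, so the confidence value can be checked by hand.

    # Standalone sketch of the two-class weighted majority vote.
    def weighted_majority_vote neighbors
      return [nil, nil] if neighbors.empty?
      weighted_sum = Hash.new(0.0)
      neighbors.each do |_id, sim, acts|
        acts.each { |act| weighted_sum[act] += sim }   # per-class similarity sums
      end
      case weighted_sum.size
      when 1 then [weighted_sum.keys.first, 1.0]
      when 2
        diff = weighted_sum.values[0] - weighted_sum.values[1]
        prediction = diff > 0 ? weighted_sum.keys[0] : weighted_sum.keys[1]
        [prediction, (diff/neighbors.size).abs]        # confidence: margin normalized by neighbor count
      else
        raise ArgumentError, "only binary classification is supported"
      end
    end

    neighbors = [
      ["c1", 0.9, ["mutagenic"]],
      ["c2", 0.8, ["mutagenic"]],
      ["c3", 0.4, ["non-mutagenic"]],
    ]
    p weighted_majority_vote(neighbors)   # => ["mutagenic", 0.43333...]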
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2bb89cd..b56a747 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -13,34 +13,26 @@ module OpenTox
     field :creator, type: String, default: __FILE__
     # datasets
     field :training_dataset_id, type: BSON::ObjectId
-    field :feature_dataset_id, type: BSON::ObjectId
     # algorithms
-    field :feature_calculation_algorithm, type: String
     field :prediction_algorithm, type: String
-    field :similarity_algorithm, type: String
-    field :min_sim, type: Float
+    field :neighbor_algorithm, type: String
+    field :neighbor_algorithm_parameters, type: Hash
     # prediction feature
     field :prediction_feature_id, type: BSON::ObjectId
 
     attr_accessor :prediction_dataset
     attr_accessor :training_dataset
-    attr_accessor :feature_dataset
-    attr_accessor :query_fingerprint
-    attr_accessor :neighbors
 
     # Create a lazar model from a training_dataset and a feature_dataset
     # @param [OpenTox::Dataset] training_dataset
-    # @param [OpenTox::Dataset] feature_dataset
     # @return [OpenTox::Model::Lazar] Regression or classification model
-    def self.create training_dataset, feature_dataset
+    def self.create training_dataset
 
-      bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty?
       bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
-      bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds
 
+      # TODO document convention
       prediction_feature = training_dataset.features.first
       prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
-      lazar.feature_dataset_id = feature_dataset.id
       lazar.training_dataset_id = training_dataset.id
       lazar.prediction_feature_id = prediction_feature.id
       lazar.title = prediction_feature.title
@@ -49,6 +41,105 @@ module OpenTox
       lazar
     end
 
+    def predict object
+
+      t = Time.now
+      at = Time.now
+
+      training_dataset = Dataset.find training_dataset_id
+      prediction_feature = Feature.find prediction_feature_id
+
+      # parse data
+      compounds = []
+      case object.class.to_s
+      when "OpenTox::Compound"
+        compounds = [object]
+      when "Array"
+        compounds = object
+      when "OpenTox::Dataset"
+        compounds = object.compounds
+      else
+        bad_request_error "Please provide an OpenTox::Compound, an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+      end
+
+      # make predictions
+      predictions = []
+      compounds.each_with_index do |compound,c|
+        t = Time.new
+        neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+        # add activities
+        # TODO: improve efficiency, takes 3 times longer than previous version
+        # TODO database activity??
+        neighbors.collect! do |n|
+          rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
+          acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
+          acts.empty? ? nil : n << acts
+        end
+        neighbors.compact! # remove neighbors without training activities
+        predictions << Algorithm.run(prediction_algorithm, neighbors)
+      end
+
+      # serialize result
+      case object.class.to_s
+      when "OpenTox::Compound"
+        return predictions.first
+      when "Array"
+        return predictions
+      when "OpenTox::Dataset"
+        # prepare prediction dataset
+        prediction_dataset = LazarPrediction.new(
+          :title => "Lazar prediction for #{prediction_feature.title}",
+          :creator => __FILE__,
+          :prediction_feature_id => prediction_feature.id
+
+        )
+        confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+        # TODO move into warnings field
+        warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+        prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+        prediction_dataset.compounds = compounds
+        prediction_dataset.data_entries = predictions
+        prediction_dataset.save_all
+        return prediction_dataset
+      end
+
+    end
+
+    def training_activities
+      i = training_dataset.feature_ids.index prediction_feature_id
+      training_dataset.data_entries.collect{|de| de[i]}
+    end
+
+  end
+
+  class LazarClassification < Lazar
+    def initialize
+      super
+      self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
+      self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+      self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+    end
+  end
+
+  class LazarFminerClassification < LazarClassification
+    field :feature_dataset_id, type: BSON::ObjectId
+    field :feature_calculation_algorithm, type: String
+
+    def self.create training_dataset
+      model = super(training_dataset)
+      model.update "_type" => self.to_s # adjust class
+      model = self.find model.id # adjust class
+      model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+      model.neighbor_algorithm_parameters = {
+        :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
+        :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
+        :min_sim => 0.3
+      }
+      model.save
+      model
+    end
+
+=begin
     def predict object
 
       t = Time.now
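Editor's note: the activity lookup inside the new predict method rescans training_dataset.compound_ids once per neighbor, which is what the efficiency TODO above refers to. A possible alternative, shown here as a plain-Ruby sketch with invented ids and activities (not part of the patch), builds the id-to-rows index once and reuses it:

    # Invented stand-ins for training_dataset.compound_ids / data_entries.
    compound_ids = ["id1", "id2", "id1", "id3"]
    data_entries = [["mutagenic"], ["non-mutagenic"], ["mutagenic"], [nil]]

    # Build the index once: compound id -> row numbers (duplicate compounds allowed).
    rows_by_id = Hash.new { |h,k| h[k] = [] }
    compound_ids.each_with_index { |cid,i| rows_by_id[cid] << i }

    # Annotate neighbors ([id, similarity] pairs) with their training activities.
    neighbors = [["id1", 0.85], ["id3", 0.72]]
    neighbors = neighbors.collect do |id,sim|
      acts = rows_by_id[id].collect { |row| data_entries[row][0] }.compact
      acts.empty? ? nil : [id, sim, acts]   # drop neighbors without measured activities
    end.compact

    p neighbors   # => [["id1", 0.85, ["mutagenic", "mutagenic"]]]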
@@ -98,17 +189,9 @@ module OpenTox
 
           next
         else
-          if prediction_algorithm =~ /Regression/
-            mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
-            mtf.transform
-            @training_fingerprints = mtf.n_prop
-            query_fingerprint = mtf.q_prop
-            neighbors = [[nil,nil,nil,query_fingerprint]]
-          else
-            #training_fingerprints = @feature_dataset.data_entries
-            query_fingerprint = @query_fingerprint[c]
-            neighbors = []
-          end
+          #training_fingerprints = @feature_dataset.data_entries
+          query_fingerprint = @query_fingerprint[c]
+          neighbors = []
 
         tt += Time.now-t
         t = Time.new
@@ -146,7 +229,7 @@ module OpenTox
 
           # AM: transform to original space (TODO)
 
-          confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
+          #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
 
           $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}"
 
@@ -184,43 +267,18 @@ module OpenTox
       end
     end
-
-    def training_dataset
-      Dataset.find training_dataset_id
-    end
-
-    def prediction_feature
-      Feature.find prediction_feature_id
-    end
-
-    def training_activities
-      i = @training_dataset.feature_ids.index prediction_feature_id
-      @training_dataset.data_entries.collect{|de| de[i]}
-    end
-
+=end
   end
 
   class LazarRegression < Lazar
-    field :min_train_performance, type: Float, default: 0.1
-    def initialize
-      super
-      self.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression"
-      self.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine"
-      self.min_sim = 0.7
-
-      # AM: transform to cosine space
-      min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
-    end
-  end
-
-  class LazarClassification < Lazar
     def initialize
       super
-      self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
-      self.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto"
-      self.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match"
-      self.min_sim = 0.3
+      self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+      self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
+      self.neighbor_algorithm_parameters = {:min_sim => 0.7}
     end
+
   end
 end
diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb
index 74e058c..97db792 100644
--- a/lib/opentox-algorithm.rb
+++ b/lib/opentox-algorithm.rb
@@ -20,7 +20,8 @@ require_relative "bbrc.rb"
 require_relative "lazar.rb"
 require_relative "transform.rb"
 require_relative "similarity.rb"
-#require_relative "neighbors.rb"
+require_relative "neighbor.rb"
 require_relative "classification.rb"
 require_relative "regression.rb"
 require_relative "validation.rb"
+require_relative "crossvalidation.rb"
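Editor's note: after this commit a model carries its neighbor search as data (neighbor_algorithm plus neighbor_algorithm_parameters) instead of hard-coded similarity fields. The following usage sketch is hypothetical: the CSV file name, the Dataset.from_csv_file and Compound.from_smiles helpers are assumptions about the surrounding opentox/lazar API, and a running MongoDB is required.

    require_relative "opentox-algorithm"   # assumes the gem's lib directory is on the load path

    # hypothetical training data; any dataset with a single nominal prediction feature would do
    training_dataset = OpenTox::Dataset.from_csv_file "hamster_carcinogenicity.csv"

    model = OpenTox::Model::LazarFminerClassification.create training_dataset
    p model.neighbor_algorithm                       # "OpenTox::Algorithm::Neighbor.fminer_similarity"
    p model.neighbor_algorithm_parameters[:min_sim]  # 0.3

    # neighbor_algorithm_parameters are handed to Algorithm.run unchanged,
    # so predict works the same way for plain and fminer-based models
    compound = OpenTox::Compound.from_smiles "c1ccccc1N"
    p model.predict(compound)                        # single compound -> single prediction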
diff --git a/lib/regression.rb b/lib/regression.rb
index 4bade40..891d7f9 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,13 +1,38 @@
-#require "rinruby"
-
 # TODO install R packages kernlab, caret, doMC, class, e1071
-# TODO use Rserve
+
+  # log transform activities (create new dataset)
+  # scale, normalize features, might not be necessary
+  # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
+  # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
+  # zero-order correlation and the semi-partial correlation
+  # seems to be necessary for svm
+  # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
+  # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
+  # use lasso or elastic net??
+  # select relevant features
+  # remove features with a single value
+  # remove correlated features
+  # remove features not correlated with endpoint
 
 module OpenTox
   module Algorithm
 
     class Regression
-require "rserve"
+
+      def self.weighted_average neighbors
+        weighted_sum = 0.0
+        sim_sum = 0.0
+        neighbors.each do |row|
+          n,sim,acts = row
+          acts.each do |act|
+            weighted_sum += sim*Math.log10(act)
+            sim_sum += sim
+          end
+        end
+        confidence = sim_sum/neighbors.size.to_f
+        sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+        [prediction,confidence]
+      end
 
       # Local support vector regression from neighbors
       # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 934c4b0..91e18db 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -23,7 +23,7 @@ module OpenTox
         #common += 1 if n == b[i]
         #end
         #common/a.size
-        # TODO check if calculation is correct
+        # TODO check if calculation speed can be improved
         common_p_sum = 0.0
         all_p_sum = 0.0
         (0...a.size).each { |idx|
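Editor's note: Regression.weighted_average is a similarity-weighted geometric mean — activities are averaged in log10 space and transformed back with 10**. A standalone sketch with invented neighbor rows ([id, similarity, activities], as elsewhere in this patch) makes the arithmetic easy to check; it is not the patched code itself.

    # Same arithmetic as Regression.weighted_average, runnable without the gem.
    def weighted_average neighbors
      weighted_sum = 0.0
      sim_sum = 0.0
      neighbors.each do |_id, sim, acts|
        acts.each do |act|
          weighted_sum += sim * Math.log10(act)   # average in log space
          sim_sum += sim
        end
      end
      confidence = sim_sum / neighbors.size.to_f
      prediction = sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)
      [prediction, confidence]
    end

    neighbors = [["c1", 1.0, [100.0]], ["c2", 0.5, [1.0]]]
    # log10 activities are 2.0 and 0.0 with weights 1.0 and 0.5:
    # weighted mean 2.0/1.5 = 1.33, prediction 10**1.33 = 21.5, confidence 1.5/2 = 0.75
    p weighted_average(neighbors)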
diff --git a/lib/validation.rb b/lib/validation.rb
index c2250de..bcbe49a 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,36 +1,41 @@
 module OpenTox
 
   class Validation
-    include OpenTox
-    include Mongoid::Document
-    include Mongoid::Timestamps
-    store_in collection: "validations"
 
     field :prediction_dataset_id, type: BSON::ObjectId
     field :test_dataset_id, type: BSON::ObjectId
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
+    field :predictions, type: Array
+
+    def prediction_dataset
+      Dataset.find prediction_dataset_id
+    end
+
+    def test_dataset
+      Dataset.find test_dataset_id
+    end
+
+  end
+
+  class ClassificationValidation < Validation
     field :accept_values, type: String
     field :confusion_matrix, type: Array
     field :weighted_confusion_matrix, type: Array
-    field :predictions, type: Array
-    # TODO classification und regression in subclasses
 
     def self.create model, training_set, test_set
      validation = self.class.new
-      feature_dataset = Dataset.find model.feature_dataset_id
-      if feature_dataset.is_a? FminerDataset
-        features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
-      else
-        # TODO search for descriptors
-      end
-      validation_model = Model::Lazar.create training_set, features
+      #feature_dataset = Dataset.find model.feature_dataset_id
+      # TODO check and delegate to Algorithm
+      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
+      validation_model = model.class.create training_set#, features
       test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
       prediction_dataset = validation_model.predict test_set_without_activities
       accept_values = prediction_dataset.prediction_feature.accept_values
       confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       predictions = []
+      nr_unpredicted = 0
       prediction_dataset.data_entries.each_with_index do |pe,i|
         if pe[0] and pe[1] and pe[1].numeric?
           prediction = pe[0]
@@ -56,13 +61,15 @@ module OpenTox
             weighted_confusion_matrix[1][0] += confidence
           end
         end
+        else
+          nr_unpredicted += 1 if pe[0].nil?
        end
      end
      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
-        :nr_unpredicted => prediction_dataset.data_entries.count{|de| de.first.nil?},
+        :nr_unpredicted => nr_unpredicted,
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
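Editor's note: the per-prediction bookkeeping in ClassificationValidation.create fills two parallel matrices, one counting predictions and one summing confidences. The sketch below reproduces that idea with invented data; the convention that rows are the predicted class and columns the measured class is an assumption inferred from the index arithmetic, not stated in the patch.

    accept_values = ["active", "inactive"]
    confusion_matrix          = Array.new(2) { Array.new(2, 0) }
    weighted_confusion_matrix = Array.new(2) { Array.new(2, 0.0) }

    # [predicted, measured, confidence] triples standing in for prediction_dataset rows
    [["active", "active", 0.9],
     ["active", "inactive", 0.4],
     ["inactive", "inactive", 0.8]].each do |predicted, measured, confidence|
      i = accept_values.index(predicted)
      j = accept_values.index(measured)
      confusion_matrix[i][j] += 1                    # plain counts
      weighted_confusion_matrix[i][j] += confidence  # confidence-weighted counts
    end

    accuracy = (confusion_matrix[0][0] + confusion_matrix[1][1]) / 3.0
    p confusion_matrix            # [[1, 1], [0, 1]]
    p weighted_confusion_matrix   # [[0.9, 0.4], [0.0, 0.8]]
    p accuracy                    # 0.666...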
@@ -71,94 +78,37 @@
       validation.save
       validation
     end
-
-    def prediction_dataset
-      Dataset.find prediction_dataset_id
-    end
-
-    def test_dataset
-      Dataset.find test_dataset_id
-    end
-
   end
 
-  class CrossValidation
-    include OpenTox
-    include Mongoid::Document
-    include Mongoid::Timestamps
-    store_in collection: "crossvalidations"
-
-    field :validation_ids, type: Array, default: []
-    field :folds, type: Integer
-    field :nr_instances, type: Integer
-    field :nr_unpredicted, type: Integer
-    field :accept_values, type: Array
-    field :confusion_matrix, type: Array
-    field :weighted_confusion_matrix, type: Array
-    field :accuracy, type: Float
-    field :weighted_accuracy, type: Float
-    field :true_rate, type: Hash
-    field :predictivity, type: Hash
-    field :predictions, type: Array
-    # TODO auc, f-measure (usability??)
-
-    def self.create model, n=10
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      accept_values = model.prediction_feature.accept_values
-      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      true_rate = {}
-      predictivity = {}
+  class RegressionValidation < Validation
+    def self.create model, training_set, test_set
+
+      validation_model = Model::LazarRegression.create training_set
+      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      prediction_dataset = validation_model.predict test_set_without_activities
       predictions = []
-      model.training_dataset.folds(n).each do |fold|
-        validation = Validation.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        validation.confusion_matrix.each_with_index do |r,i|
-          r.each_with_index do |c,j|
-            confusion_matrix[i][j] += c
-            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
-          end
-        end
-        predictions << validation.predictions
-      end
-      true_rate = {}
-      predictivity = {}
-      accept_values.each_with_index do |v,i|
-        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
-        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
-      end
-      confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
+      nr_unpredicted = 0
+      activities = test_set.data_entries.collect{|de| de.first}
+      prediction_dataset.data_entries.each_with_index do |de,i|
+        if de[0] and de[1] and de[1].numeric?
+          activity = activities[i]
+          prediction = de.first
+          confidence = de[1]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+        else
+          nr_unpredicted += 1
         end
       end
-      cv = CrossValidation.new(
-        :folds => n,
-        :validation_ids => validation_ids,
-        :nr_instances => nr_instances,
+      validation = self.new(
+        :prediction_dataset_id => prediction_dataset.id,
+        :test_dataset_id => test_set.id,
+        :nr_instances => test_set.compound_ids.size,
         :nr_unpredicted => nr_unpredicted,
-        :accept_values => accept_values,
-        :confusion_matrix => confusion_matrix,
-        :weighted_confusion_matrix => weighted_confusion_matrix,
-        :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
-        :true_rate => true_rate,
-        :predictivity => predictivity,
         :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
       )
-      cv.save
-      cv
+      validation.save
+      validation
     end
-
-    #Average area under roc 0.646
-    #Area under roc 0.646
-    #F measure carcinogen: 0.769, noncarcinogen: 0.348
-
   end
 end
-- 
cgit v1.2.3
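Editor's note: RegressionValidation stores raw [compound_id, measured, predicted, confidence] rows but, in this commit, no regression statistics. The sketch below shows how summary figures could be derived from that array; the data is invented and RMSE/R^2 are the editor's choice of metrics, not part of the patch.

    predictions = [
      ["c1", 2.10, 2.30, 0.80],
      ["c2", 3.50, 3.10, 0.70],
      ["c3", 1.20, 1.90, 0.40],
    ]

    measured  = predictions.collect { |row| row[1] }
    predicted = predictions.collect { |row| row[2] }
    n = predictions.size

    # root mean squared error
    rmse = Math.sqrt(measured.zip(predicted).collect { |m, p| (m - p)**2 }.reduce(:+) / n)

    # coefficient of determination
    mean = measured.reduce(:+) / n
    ss_res = measured.zip(predicted).collect { |m, p| (m - p)**2 }.reduce(:+)
    ss_tot = measured.collect { |m| (m - mean)**2 }.reduce(:+)
    r_squared = 1 - ss_res / ss_tot

    puts "RMSE: #{rmse.round(3)}, R^2: #{r_squared.round(3)}"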