summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-08-09 13:40:52 +0200
committerChristoph Helma <helma@in-silico.ch>2015-08-09 13:40:52 +0200
commit65c7bdd2bc5de1c2f7bf44a4ed93cb80cc7b4b17 (patch)
treedb0ec599ea9116db8535bd9061974dcd38dc58e5
parentcf98ec284d07adb51910794f0a6e6583382ce68e (diff)
customized prediction algorithms implemented
-rw-r--r--lib/algorithm.rb12
-rw-r--r--lib/classification.rb28
-rw-r--r--lib/lazar.rb166
-rw-r--r--lib/opentox-algorithm.rb3
-rw-r--r--lib/regression.rb33
-rw-r--r--lib/similarity.rb2
-rw-r--r--lib/validation.rb136
7 files changed, 224 insertions, 156 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 0e227d6..113f847 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -2,10 +2,18 @@ module OpenTox
module Algorithm
- def self.run algorithm, object, parameters={}
+ # Generic method to execute algorithms
+ # Algorithms should:
+ # - accept a Compound, an Array of Compounds or a Dataset as first argument
+ # - optional parameters as second argument
+ # - return an object corresponding to the input type as result (e.g. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
+ # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
+ # @param [Hash] Algorithm parameters
+ # @return Algorithm result
+ def self.run algorithm, object, parameters=nil
bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
klass,method = algorithm.split('.')
- parameters.empty? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+ parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
end
end
diff --git a/lib/classification.rb b/lib/classification.rb
index d71ab77..fc6fa77 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,10 +3,35 @@ module OpenTox
class Classification
+ def self.weighted_majority_vote neighbors
+ return [nil,nil] if neighbors.empty?
+ weighted_sum = {}
+ sim_sum = 0.0
+ neighbors.each do |row|
+ n,sim,acts = row
+ acts.each do |act|
+ weighted_sum[act] ||= 0
+ weighted_sum[act] += sim
+ end
+ end
+ case weighted_sum.size
+ when 1
+ return [weighted_sum.keys.first, 1.0]
+ when 2
+ sim_sum = weighted_sum[weighted_sum.keys[0]]
+ sim_sum -= weighted_sum[weighted_sum.keys[1]]
+ sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
+ confidence = (sim_sum/neighbors.size).abs
+ return [prediction,confidence]
+ else
+ bad_request_error "Cannot predict more than 2 classes, multinomial classification is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
+ end
+ end
+
# Classification with majority vote from neighbors weighted by similarity
# @param [Hash] params Keys `:activities, :sims, :value_map` are required
# @return [Numeric] A prediction value.
- def self.weighted_majority_vote(neighbors)
+ def self.fminer_weighted_majority_vote neighbors, training_dataset
neighbor_contribution = 0.0
confidence_sum = 0.0
@@ -15,6 +40,7 @@ module OpenTox
values = neighbors.collect{|n| n[2]}.uniq
neighbors.each do |neighbor|
+ i = training_dataset.compound_ids.index neighbor.id
neighbor_weight = neighbor[1]
activity = values.index(neighbor[2]) + 1 # map values to integers > 1
neighbor_contribution += activity * neighbor_weight
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2bb89cd..b56a747 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -13,34 +13,26 @@ module OpenTox
field :creator, type: String, default: __FILE__
# datasets
field :training_dataset_id, type: BSON::ObjectId
- field :feature_dataset_id, type: BSON::ObjectId
# algorithms
- field :feature_calculation_algorithm, type: String
field :prediction_algorithm, type: String
- field :similarity_algorithm, type: String
- field :min_sim, type: Float
+ field :neighbor_algorithm, type: String
+ field :neighbor_algorithm_parameters, type: Hash
# prediction feature
field :prediction_feature_id, type: BSON::ObjectId
attr_accessor :prediction_dataset
attr_accessor :training_dataset
- attr_accessor :feature_dataset
- attr_accessor :query_fingerprint
- attr_accessor :neighbors
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
- # @param [OpenTox::Dataset] feature_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
- def self.create training_dataset, feature_dataset
+ def self.create training_dataset
- bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty?
bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
- bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds
+ # TODO document convention
prediction_feature = training_dataset.features.first
prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
- lazar.feature_dataset_id = feature_dataset.id
lazar.training_dataset_id = training_dataset.id
lazar.prediction_feature_id = prediction_feature.id
lazar.title = prediction_feature.title
@@ -54,6 +46,105 @@ module OpenTox
t = Time.now
at = Time.now
+ training_dataset = Dataset.find training_dataset_id
+ prediction_feature = Feature.find prediction_feature_id
+
+ # parse data
+ compounds = []
+ case object.class.to_s
+ when "OpenTox::Compound"
+ compounds = [object]
+ when "Array"
+ compounds = object
+ when "OpenTox::Dataset"
+ compounds = object.compounds
+ else
+ bad_request_error "Please provide an OpenTox::Compound, an Array of OpenTox::Compounds, or an OpenTox::Dataset as parameter."
+ end
+
+ # make predictions
+ predictions = []
+ compounds.each_with_index do |compound,c|
+ t = Time.new
+ neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+ # add activities
+ # TODO: improve efficiency, takes 3 times longer than previous version
+ # TODO database activity??
+ neighbors.collect! do |n|
+ rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
+ acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
+ acts.empty? ? nil : n << acts
+ end
+ neighbors.compact! # remove neighbors without training activities
+ predictions << Algorithm.run(prediction_algorithm, neighbors)
+ end
+
+ # serialize result
+ case object.class.to_s
+ when "OpenTox::Compound"
+ return predictions.first
+ when "Array"
+ return predictions
+ when "OpenTox::Dataset"
+ # prepare prediction dataset
+ prediction_dataset = LazarPrediction.new(
+ :title => "Lazar prediction for #{prediction_feature.title}",
+ :creator => __FILE__,
+ :prediction_feature_id => prediction_feature.id
+
+ )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+ # TODO move into warnings field
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+ prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.compounds = compounds
+ prediction_dataset.data_entries = predictions
+ prediction_dataset.save_all
+ return prediction_dataset
+ end
+
+ end
+
+ def training_activities
+ i = training_dataset.feature_ids.index prediction_feature_id
+ training_dataset.data_entries.collect{|de| de[i]}
+ end
+
+ end
+
+ class LazarClassification < Lazar
+ def initialize
+ super
+ self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ end
+ end
+
+ class LazarFminerClassification < LazarClassification
+ field :feature_dataset_id, type: BSON::ObjectId
+ field :feature_calculation_algorithm, type: String
+
+ def self.create training_dataset
+ model = super(training_dataset)
+ model.update "_type" => self.to_s # adjust class
+ model = self.find model.id # adjust class
+ model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+ model.neighbor_algorithm_parameters = {
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
+ :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
+ :min_sim => 0.3
+ }
+ model.save
+ model
+ end
+
+=begin
+ def predict object
+
+ t = Time.now
+ at = Time.now
+
@training_dataset = OpenTox::Dataset.find(training_dataset_id)
@feature_dataset = OpenTox::Dataset.find(feature_dataset_id)
@@ -98,17 +189,9 @@ module OpenTox
next
else
- if prediction_algorithm =~ /Regression/
- mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
- mtf.transform
- @training_fingerprints = mtf.n_prop
- query_fingerprint = mtf.q_prop
- neighbors = [[nil,nil,nil,query_fingerprint]]
- else
- #training_fingerprints = @feature_dataset.data_entries
- query_fingerprint = @query_fingerprint[c]
- neighbors = []
- end
+ #training_fingerprints = @feature_dataset.data_entries
+ query_fingerprint = @query_fingerprint[c]
+ neighbors = []
tt += Time.now-t
t = Time.new
@@ -146,7 +229,7 @@ module OpenTox
# AM: transform to original space (TODO)
- confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
+ #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
$logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}"
@@ -184,43 +267,18 @@ module OpenTox
end
end
-
- def training_dataset
- Dataset.find training_dataset_id
- end
-
- def prediction_feature
- Feature.find prediction_feature_id
- end
-
- def training_activities
- i = @training_dataset.feature_ids.index prediction_feature_id
- @training_dataset.data_entries.collect{|de| de[i]}
- end
-
+=end
end
class LazarRegression < Lazar
- field :min_train_performance, type: Float, default: 0.1
- def initialize
- super
- self.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression"
- self.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine"
- self.min_sim = 0.7
-
- # AM: transform to cosine space
- min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
- end
- end
- class LazarClassification < Lazar
def initialize
super
- self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
- self.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto"
- self.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match"
- self.min_sim = 0.3
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
end
+
end
end
diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb
index 74e058c..97db792 100644
--- a/lib/opentox-algorithm.rb
+++ b/lib/opentox-algorithm.rb
@@ -20,7 +20,8 @@ require_relative "bbrc.rb"
require_relative "lazar.rb"
require_relative "transform.rb"
require_relative "similarity.rb"
-#require_relative "neighbors.rb"
+require_relative "neighbor.rb"
require_relative "classification.rb"
require_relative "regression.rb"
require_relative "validation.rb"
+require_relative "crossvalidation.rb"
diff --git a/lib/regression.rb b/lib/regression.rb
index 4bade40..891d7f9 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,13 +1,38 @@
-#require "rinruby"
-
# TODO install R packages kernlab, caret, doMC, class, e1071
-# TODO use Rserve
+
+ # log transform activities (create new dataset)
+ # scale, normalize features, might not be necessary
+ # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
+ # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
+ # zero-order correlation and the semi-partial correlation
+ # seems to be necessary for svm
+ # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
+ # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
+ # use lasso or elastic net??
+ # select relevant features
+ # remove features with a single value
+ # remove correlated features
+ # remove features not correlated with endpoint
module OpenTox
module Algorithm
class Regression
-require "rserve"
+
+ def self.weighted_average neighbors
+ weighted_sum = 0.0
+ sim_sum = 0.0
+ neighbors.each do |row|
+ n,sim,acts = row
+ acts.each do |act|
+ weighted_sum += sim*Math.log10(act)
+ sim_sum += sim
+ end
+ end
+ confidence = sim_sum/neighbors.size.to_f
+ sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+ [prediction,confidence]
+ end
# Local support vector regression from neighbors
# @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 934c4b0..91e18db 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -23,7 +23,7 @@ module OpenTox
#common += 1 if n == b[i]
#end
#common/a.size
- # TODO check if calculation is correct
+ # TODO check if calculation speed can be improved
common_p_sum = 0.0
all_p_sum = 0.0
(0...a.size).each { |idx|
diff --git a/lib/validation.rb b/lib/validation.rb
index c2250de..bcbe49a 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,36 +1,41 @@
module OpenTox
class Validation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "validations"
field :prediction_dataset_id, type: BSON::ObjectId
field :test_dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
+ field :predictions, type: Array
+
+ def prediction_dataset
+ Dataset.find prediction_dataset_id
+ end
+
+ def test_dataset
+ Dataset.find test_dataset_id
+ end
+
+ end
+
+ class ClassificationValidation < Validation
field :accept_values, type: String
field :confusion_matrix, type: Array
field :weighted_confusion_matrix, type: Array
- field :predictions, type: Array
- # TODO classification und regression in subclasses
def self.create model, training_set, test_set
validation = self.class.new
- feature_dataset = Dataset.find model.feature_dataset_id
- if feature_dataset.is_a? FminerDataset
- features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
- else
- # TODO search for descriptors
- end
- validation_model = Model::Lazar.create training_set, features
+ #feature_dataset = Dataset.find model.feature_dataset_id
+ # TODO check and delegate to Algorithm
+ #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
+ validation_model = model.class.create training_set#, features
test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
prediction_dataset = validation_model.predict test_set_without_activities
accept_values = prediction_dataset.prediction_feature.accept_values
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
predictions = []
+ nr_unpredicted = 0
prediction_dataset.data_entries.each_with_index do |pe,i|
if pe[0] and pe[1] and pe[1].numeric?
prediction = pe[0]
@@ -56,13 +61,15 @@ module OpenTox
weighted_confusion_matrix[1][0] += confidence
end
end
+ else
+ nr_unpredicted += 1 if pe[0].nil?
end
end
validation = self.new(
:prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
:nr_instances => test_set.compound_ids.size,
- :nr_unpredicted => prediction_dataset.data_entries.count{|de| de.first.nil?},
+ :nr_unpredicted => nr_unpredicted,
:accept_values => accept_values,
:confusion_matrix => confusion_matrix,
:weighted_confusion_matrix => weighted_confusion_matrix,
@@ -71,94 +78,37 @@ module OpenTox
validation.save
validation
end
-
- def prediction_dataset
- Dataset.find prediction_dataset_id
- end
-
- def test_dataset
- Dataset.find test_dataset_id
- end
-
end
- class CrossValidation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "crossvalidations"
-
- field :validation_ids, type: Array, default: []
- field :folds, type: Integer
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :predictions, type: Array
- # TODO auc, f-measure (usability??)
-
- def self.create model, n=10
- validation_ids = []
- nr_instances = 0
- nr_unpredicted = 0
- accept_values = model.prediction_feature.accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
+ class RegressionValidation < Validation
+ def self.create model, training_set, test_set
+
+ validation_model = Model::LazarRegression.create training_set
+ test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+ prediction_dataset = validation_model.predict test_set_without_activities
predictions = []
- model.training_dataset.folds(n).each do |fold|
- validation = Validation.create(model, fold[0], fold[1])
- validation_ids << validation.id
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- validation.confusion_matrix.each_with_index do |r,i|
- r.each_with_index do |c,j|
- confusion_matrix[i][j] += c
- weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
- end
- end
- predictions << validation.predictions
- end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+ nr_unpredicted = 0
+ activities = test_set.data_entries.collect{|de| de.first}
+ prediction_dataset.data_entries.each_with_index do |de,i|
+ if de[0] and de[1] and de[1].numeric?
+ activity = activities[i]
+ prediction = de.first
+ confidence = de[1]
+ predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+ else
+ nr_unpredicted += 1
end
end
- cv = CrossValidation.new(
- :folds => n,
- :validation_ids => validation_ids,
- :nr_instances => nr_instances,
+ validation = self.new(
+ :prediction_dataset_id => prediction_dataset.id,
+ :test_dataset_id => test_set.id,
+ :nr_instances => test_set.compound_ids.size,
:nr_unpredicted => nr_unpredicted,
- :accept_values => accept_values,
- :confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
- :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- :true_rate => true_rate,
- :predictivity => predictivity,
:predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
)
- cv.save
- cv
+ validation.save
+ validation
end
-
- #Average area under roc 0.646
- #Area under roc 0.646
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
-
end
end