author     Christoph Helma <helma@in-silico.ch>   2016-03-15 17:40:40 +0100
committer  Christoph Helma <helma@in-silico.ch>   2016-03-15 17:40:40 +0100
commit     7c3bd90c26dfeea2db3cf74a1cefc23d8dece7c0 (patch)
tree       045d18b43e30ef3bf9a548230e45986b591535a6 /lib
parent     0c5d2e678908a2d4aea43efbedbedc2c0439be30 (diff)

validation tests pass

Diffstat (limited to 'lib')
-rw-r--r--  lib/classification.rb            73
-rw-r--r--  lib/crossvalidation.rb           68
-rw-r--r--  lib/dataset.rb                   23
-rw-r--r--  lib/leave-one-out-validation.rb  16
-rw-r--r--  lib/model.rb                     77
-rw-r--r--  lib/regression.rb                43
-rw-r--r--  lib/validation.rb                 3
7 files changed, 91 insertions, 212 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index abbb5b3..0202940 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -28,80 +28,7 @@ module OpenTox
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
end
end
-
- # Classification with majority vote from neighbors weighted by similarity
- # @param [Hash] params Keys `:activities, :sims, :value_map` are required
- # @return [Numeric] A prediction value.
- def self.fminer_weighted_majority_vote neighbors, training_dataset
-
- neighbor_contribution = 0.0
- confidence_sum = 0.0
-
- $logger.debug "Weighted Majority Vote Classification."
-
- values = neighbors.collect{|n| n[2]}.uniq
- neighbors.each do |neighbor|
- i = training_dataset.compound_ids.index n.id
- neighbor_weight = neighbor[1]
- activity = values.index(neighbor[2]) + 1 # map values to integers > 1
- neighbor_contribution += activity * neighbor_weight
- if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
- case activity
- when 1
- confidence_sum -= neighbor_weight
- when 2
- confidence_sum += neighbor_weight
- end
- else
- confidence_sum += neighbor_weight
- end
- end
- if values.size == 2
- if confidence_sum >= 0.0
- prediction = values[1]
- elsif confidence_sum < 0.0
- prediction = values[0]
- end
- elsif values.size == 1 # all neighbors have the same value
- prediction = values[0]
- else
- prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
- end
-
- confidence = (confidence_sum/neighbors.size).abs
- {:value => prediction, :confidence => confidence.abs}
- end
-
- # Local support vector regression from neighbors
- # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
- # @return [Numeric] A prediction value.
- def self.local_svm_classification(params)
-
- confidence = 0.0
- prediction = nil
-
- $logger.debug "Local SVM."
- if params[:activities].size>0
- if params[:props]
- n_prop = params[:props][0].collect.to_a
- q_prop = params[:props][1].collect.to_a
- props = [ n_prop, q_prop ]
- end
- activities = params[:activities].collect.to_a
- activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
- prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
- prediction = prediction.sub(/Val/,"") if prediction # Convert back
- confidence = 0.0 if prediction.nil?
- confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
- end
- {:value => prediction, :confidence => confidence}
-
- end
-
-
-
end
-
end
end
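
Note: the hunk above drops the fminer-based weighted majority vote and the local SVM classifier from lib/classification.rb, leaving only the remaining weighted-vote code path. For reference, the core idea of a similarity-weighted majority vote fits in a few lines. The sketch below is a standalone illustration, not the lazar implementation; it assumes neighbors arrive as [class value, similarity weight] pairs, which is not the hash layout lazar uses.

    # Minimal similarity-weighted majority vote: sum the weights per class and
    # pick the class with the largest total. Input shape is illustrative.
    def weighted_majority_vote neighbors
      weights = Hash.new(0.0)
      neighbors.each { |value, weight| weights[value] += weight }
      total = weights.values.reduce(:+)
      value, weight = weights.max_by { |_, w| w }
      { value: value, probability: weight / total }
    end

    weighted_majority_vote [["active", 0.9], ["inactive", 0.4], ["active", 0.3]]
    # => {:value=>"active", :probability=>0.75}
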
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index cd94e33..08a5ad3 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -52,9 +52,10 @@ module OpenTox
cv.update_attributes(
nr_instances: nr_instances,
nr_unpredicted: nr_unpredicted,
- predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+ predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
)
$logger.debug "Nr unpredicted: #{nr_unpredicted}"
+ cv.statistics
cv
end
end
@@ -78,23 +79,26 @@ module OpenTox
true_rate = {}
predictivity = {}
predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction and confidence.numeric?
- if prediction == activity
- if prediction == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += confidence
- end
- elsif prediction != activity
- if prediction == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += confidence
+ compound_id,activities,prediction,confidence = pred
+ if activities and prediction #and confidence.numeric?
+ if activities.uniq.size == 1
+ activity = activities.uniq.first
+ if prediction == activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][0] += 1
+ #weighted_confusion_matrix[0][0] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][1] += 1
+ #weighted_confusion_matrix[1][1] += confidence
+ end
+ elsif prediction != activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][1] += 1
+ #weighted_confusion_matrix[0][1] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][0] += 1
+ #weighted_confusion_matrix[1][0] += confidence
+ end
end
end
else
@@ -108,17 +112,17 @@ module OpenTox
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
end
confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
- end
- end
+ #weighted_confusion_matrix.each do |r|
+ #r.each do |c|
+ #confidence_sum += c
+ #end
+ #end
update_attributes(
accept_values: accept_values,
confusion_matrix: confusion_matrix,
- weighted_confusion_matrix: weighted_confusion_matrix,
+ #weighted_confusion_matrix: weighted_confusion_matrix,
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+ #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
true_rate: true_rate,
predictivity: predictivity,
finished_at: Time.now
@@ -161,20 +165,12 @@ module OpenTox
field :rmse, type: Float
field :mae, type: Float
- field :weighted_rmse, type: Float
- field :weighted_mae, type: Float
field :r_squared, type: Float
field :correlation_plot_id, type: BSON::ObjectId
- field :confidence_plot_id, type: BSON::ObjectId
def statistics
rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
mae = 0
- weighted_mae = 0
- confidence_sum = 0
x = []
y = []
predictions.each do |pred|
@@ -185,10 +181,10 @@ module OpenTox
y << -Math.log10(prediction)
error = Math.log10(prediction)-Math.log10(activity.median)
rmse += error**2
- weighted_rmse += confidence*error**2
+ #weighted_rmse += confidence*error**2
mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
+ #weighted_mae += confidence*error.abs
+ #confidence_sum += confidence
end
else
warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
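
Note: the classification statistics hunk above switches from a single activity plus confidence weighting to a per-compound activities array and a plain, unweighted confusion matrix; the regression hunk drops the weighted RMSE/MAE in the same way. The accuracy and per-class predictivity stored by statistics come straight from the 2x2 matrix. The sketch below reproduces that arithmetic on invented counts, with rows indexed by predicted class in accept_values order, as in the code.

    # Accuracy and per-class predictivity from a 2x2 confusion matrix.
    # Counts are invented; rows = predicted class, columns follow accept_values.
    confusion_matrix = [[40, 10],   # predicted accept_values[0]: 40 right, 10 wrong
                        [ 5, 45]]   # predicted accept_values[1]: 45 right,  5 wrong
    n_predicted = confusion_matrix.flatten.reduce(:+)                               # 100
    accuracy = (confusion_matrix[0][0] + confusion_matrix[1][1]) / n_predicted.to_f # 0.85
    # per-class predictivity as in the statistics method: diagonal over column sum
    predictivity = 2.times.collect do |i|
      confusion_matrix[i][i] / confusion_matrix.collect { |row| row[i] }.reduce(:+).to_f
    end
    # => [0.888..., 0.818...]
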
diff --git a/lib/dataset.rb b/lib/dataset.rb
index af851b5..5d8aeaf 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -85,6 +85,7 @@ module OpenTox
compound.dataset_ids << dataset.id
compound.save
end
+ dataset.save
dataset
end
start = last+1
@@ -283,28 +284,6 @@ module OpenTox
end
end
- def scale
- scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
- centers = []
- scales = []
- feature_ids.each_with_index do |feature_id,col|
- R.assign "x", data_entries.collect{|de| de[col]}
- R.eval "scaled = scale(x,center=T,scale=T)"
- centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
- scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
- R.eval("scaled").to_ruby.each_with_index do |value,row|
- scaled_data_entries[row][col] = value
- end
- end
- scaled_dataset = ScaledDataset.new(attributes)
- scaled_dataset["_id"] = BSON::ObjectId.new
- scaled_dataset["_type"] = "OpenTox::ScaledDataset"
- scaled_dataset.centers = centers
- scaled_dataset.scales = scales
- scaled_dataset.data_entries = scaled_data_entries
- scaled_dataset.save
- scaled_dataset
- end
end
# Dataset for lazar predictions
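
Note: the removed scale method handed column-wise centering and scaling to R via scale(x, center=T, scale=T). The same z-score transformation can be written in plain Ruby; the helper below only illustrates what the deleted code computed (mean and sample standard deviation per column) and is not part of the lazar API. It assumes a rectangular data_entries array of numbers without nils.

    # Plain-Ruby equivalent of the removed R-based column scaling (z-score per column).
    def scale_columns data_entries
      n_cols = data_entries.first.size
      centers = []
      scales = []
      scaled = Array.new(data_entries.size) { Array.new(n_cols) }
      n_cols.times do |col|
        column = data_entries.collect { |row| row[col] }
        mean = column.reduce(:+) / column.size.to_f
        sd = Math.sqrt(column.collect { |v| (v - mean)**2 }.reduce(:+) / (column.size - 1).to_f)
        centers << mean
        scales << sd
        column.each_with_index { |v, row| scaled[row][col] = (v - mean) / sd }
      end
      [scaled, centers, scales]
    end
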
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 9db10c6..2cd13db 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -18,7 +18,7 @@ module OpenTox
predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
loo.nr_instances = predictions.size
predictions.select!{|p| p[:value]} # remove unpredicted
- loo.predictions = predictions.sort{|a,b| b[:confidence] <=> a[:confidence]}
+ loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
loo.statistics
loo.save
@@ -126,8 +126,8 @@ module OpenTox
field :rmse, type: Float, default: 0.0
field :mae, type: Float, default: 0
- field :weighted_rmse, type: Float, default: 0
- field :weighted_mae, type: Float, default: 0
+ #field :weighted_rmse, type: Float, default: 0
+ #field :weighted_mae, type: Float, default: 0
field :r_squared, type: Float
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
@@ -143,10 +143,10 @@ module OpenTox
measured_values << activity
error = Math.log10(pred[:value])-Math.log10(activity)
self.rmse += error**2
- self.weighted_rmse += pred[:confidence]*error**2
+ #self.weighted_rmse += pred[:confidence]*error**2
self.mae += error.abs
- self.weighted_mae += pred[:confidence]*error.abs
- confidence_sum += pred[:confidence]
+ #self.weighted_mae += pred[:confidence]*error.abs
+ #confidence_sum += pred[:confidence]
end
end
if pred[:database_activities].empty?
@@ -160,9 +160,9 @@ module OpenTox
r = R.eval("r").to_ruby
self.mae = self.mae/predictions.size
- self.weighted_mae = self.weighted_mae/confidence_sum
+ #self.weighted_mae = self.weighted_mae/confidence_sum
self.rmse = Math.sqrt(self.rmse/predictions.size)
- self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
+ #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
self.r_squared = r**2
self.finished_at = Time.now
save
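
Note: with the confidence-weighted variants commented out, the LOO regression statistics accumulate plain RMSE and MAE on log10-transformed activities. The sketch below shows those two error measures on invented measured/predicted pairs; the variable names are illustrative.

    # RMSE and MAE on log10-transformed values, as accumulated in the statistics
    # method above. The pairs are invented for illustration: [measured, predicted].
    pairs = [[10.0, 12.0], [100.0, 80.0], [1.0, 1.5]]
    errors = pairs.collect { |measured, predicted| Math.log10(predicted) - Math.log10(measured) }
    rmse = Math.sqrt(errors.collect { |e| e**2 }.reduce(:+) / errors.size)
    mae  = errors.collect(&:abs).reduce(:+) / errors.size
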
diff --git a/lib/model.rb b/lib/model.rb
index ebc0db3..f21ea54 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,13 +47,32 @@ module OpenTox
self
end
- def predict object
+ def predict_compound compound
+ prediction_feature = Feature.find prediction_feature_id
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+ # remove neighbors without prediction_feature
+ # check for database activities (neighbors may include query compound)
+ database_activities = nil
+ prediction = {}
+ if neighbors.collect{|n| n["_id"]}.include? compound.id
+
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+ prediction[:database_activities] = database_activities
+ prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
+ neighbors.delete_if{|n| n["_id"] == compound.id}
+ end
+ neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+ if neighbors.empty?
+ prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
+ else
+ prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
+ end
+ prediction
+ end
- t = Time.now
- at = Time.now
+ def predict object
training_dataset = Dataset.find training_dataset_id
- prediction_feature = Feature.find prediction_feature_id
# parse data
compounds = []
@@ -70,30 +89,7 @@ module OpenTox
# make predictions
predictions = []
- neighbors = []
- compounds.each_with_index do |compound,c|
- t = Time.new
-
- neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
- # remove neighbors without prediction_feature
- # check for database activities (neighbors may include query compound)
- database_activities = nil
- prediction = {}
- if neighbors.collect{|n| n["_id"]}.include? compound.id
-
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
- prediction[:database_activities] = database_activities
- prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
- neighbors.delete_if{|n| n["_id"] == compound.id}
- end
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
- if neighbors.empty?
- prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
- else
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
- end
- predictions << prediction
- end
+ predictions = compounds.collect{|c| predict_compound c}
# serialize result
case object.class.to_s
@@ -105,7 +101,8 @@ module OpenTox
return predictions
when "OpenTox::Dataset"
# prepare prediction dataset
- measurement_feature = prediction_feature
+ measurement_feature = Feature.find prediction_feature_id
+
prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
prediction_dataset = LazarPrediction.new(
:name => "Lazar prediction for #{prediction_feature.name}",
@@ -114,11 +111,9 @@ module OpenTox
)
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
- # TODO move into warnings field
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
prediction_dataset.compounds = compounds
- # TODO fix dataset measurements
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
prediction_dataset.save
return prediction_dataset
@@ -171,25 +166,6 @@ module OpenTox
end
end
- class LazarFminerClassification < LazarClassification
- field :feature_calculation_parameters, type: Hash
-
- def self.create training_dataset, fminer_params={}
- model = super(training_dataset)
- model.update "_type" => self.to_s # adjust class
- model = self.find model.id # adjust class
- model.neighbor_algorithm = "fminer_neighbors"
- model.neighbor_algorithm_parameters = {
- :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
- :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
- :min_sim => 0.3
- }
- model.feature_calculation_parameters = fminer_params
- model.save
- model
- end
- end
-
class Prediction
include OpenTox
include Mongoid::Document
@@ -238,7 +214,6 @@ module OpenTox
training_dataset = Dataset.from_csv_file file
model = nil
if training_dataset.features.first.nominal?
- #model = LazarFminerClassification.create training_dataset
model = LazarClassification.create training_dataset
elsif training_dataset.features.first.numeric?
model = LazarRegression.create training_dataset
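
Note: the model.rb changes factor the per-compound logic out of predict into the new predict_compound method, so predict now just collects predict_compound over the parsed compounds; the fminer-based classification model is removed entirely. The outline below mirrors the predict_compound decision flow with stand-in data: neighbor hashes carry only the keys the method reads ("_id", "features"), and the neighbor search and final Algorithm.run call are replaced by placeholders.

    # Outline of the predict_compound decision flow; data shapes are stand-ins.
    def sketch_predict_compound compound_id, neighbors, feature_id
      prediction = {}
      same_structure = neighbors.select { |n| n["_id"] == compound_id }
      unless same_structure.empty?
        # query compound found among the neighbors: report its measured activities
        prediction[:database_activities] = same_structure.first["features"][feature_id].uniq
        neighbors = neighbors.reject { |n| n["_id"] == compound_id }
      end
      # drop neighbors without data for the prediction feature
      neighbors = neighbors.reject { |n| n["features"].empty? or n["features"][feature_id] == [nil] }
      if neighbors.empty?
        prediction.merge(value: nil, warning: "no similar compounds with experimental data")
      else
        prediction.merge(value: :algorithm_result_placeholder) # stands in for Algorithm.run
      end
    end

    sketch_predict_compound "c1",
      [{ "_id" => "c1", "features" => { "f1" => [1.0, 1.0] } },
       { "_id" => "c2", "features" => { "f1" => [2.0] } }], "f1"
    # => {:database_activities=>[1.0], :value=>:algorithm_result_placeholder}
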
diff --git a/lib/regression.rb b/lib/regression.rb
index e0b109e..b8efd30 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,25 +1,23 @@
module OpenTox
module Algorithm
- # TODO add LOO errors
class Regression
def self.local_weighted_average compound, params
weighted_sum = 0.0
sim_sum = 0.0
- confidence = 0.0
neighbors = params[:neighbors]
neighbors.each do |row|
sim = row["tanimoto"]
- confidence = sim if sim > confidence # distance to nearest neighbor
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- weighted_sum += sim*Math.log10(act)
- sim_sum += sim
+ if row["features"][params[:prediction_feature_id].to_s]
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ weighted_sum += sim*Math.log10(act)
+ sim_sum += sim
+ end
end
end
- confidence = 0 if confidence.nan?
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
- {:value => prediction,:confidence => confidence}
+ {:value => prediction}
end
# TODO explicit neighbors, also for physchem
@@ -31,15 +29,18 @@ module OpenTox
weights = []
fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
+ #p neighbors
neighbors.each_with_index do |row,i|
neighbor = Compound.find row["_id"]
fingerprint = neighbor.fingerprint
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"]
- fingerprint_ids.each_with_index do |id,j|
- fingerprints[id] ||= []
- fingerprints[id] << fingerprint.include?(id)
+ if row["features"][params[:prediction_feature_id].to_s]
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ activities << Math.log10(act)
+ weights << row["tanimoto"]
+ fingerprint_ids.each_with_index do |id,j|
+ fingerprints[id] ||= []
+ fingerprints[id] << fingerprint.include?(id)
+ end
end
end
end
@@ -86,12 +87,14 @@ module OpenTox
neighbors.each_with_index do |row,i|
neighbor = Compound.find row["_id"]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"] # TODO cosine ?
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
- physchem[pid] ||= []
- physchem[pid] << v
+ if row["features"][params[:prediction_feature_id].to_s]
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ activities << Math.log10(act)
+ weights << row["tanimoto"] # TODO cosine ?
+ neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+ physchem[pid] ||= []
+ physchem[pid] << v
+ end
end
end
end
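
Note: across regression.rb the change is the same in every method: neighbors that lack the prediction feature are now skipped instead of being assumed present, and the separate confidence value is dropped. The heart of local_weighted_average is a similarity-weighted mean of log10 activities; the sketch below shows that calculation on invented neighbor tuples of the form [tanimoto similarity, activities array].

    # Similarity-weighted average on log10 activities; neighbor tuples are invented.
    def weighted_average_sketch neighbors
      weighted_sum = 0.0
      sim_sum = 0.0
      neighbors.each do |sim, activities|
        next if activities.nil? or activities.empty?   # skip neighbors without the feature
        activities.each do |act|
          weighted_sum += sim * Math.log10(act)
          sim_sum += sim
        end
      end
      sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)
    end

    weighted_average_sketch [[0.9, [100.0]], [0.5, [10.0]], [0.2, nil]]
    # => 10**((0.9*2 + 0.5*1)/1.4) ≈ 44
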
diff --git a/lib/validation.rb b/lib/validation.rb
index 3659341..b72d273 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -37,11 +37,10 @@ module OpenTox
nr_unpredicted = 0
activities = test_set.data_entries.collect{|de| de.first}
prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] and de[1]
+ if de[0] #and de[1]
cid = prediction_dataset.compound_ids[i]
rows = cids.each_index.select{|r| cids[r] == cid }
activities = rows.collect{|r| test_set.data_entries[r][0]}
- #activity = activities[i]
prediction = de.first
confidence = de[1]
predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
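
Note: the validation.rb hunk keeps predictions even when the confidence column (de[1]) is empty and gathers every measured activity for a compound into one array, matching the activities arrays now expected by the statistics code. A compact sketch of how those prediction rows are assembled, on invented data:

    # Assemble [compound_id, activities, prediction, confidence] rows; replicate
    # test-set measurements for a compound are collected into one array.
    cids              = ["c1", "c1", "c2"]             # test-set compound ids
    test_activities   = [[1.2], [1.4], [7.0]]          # one measured value per row
    prediction_values = { "c1" => 1.3, "c2" => nil }   # nil = unpredicted

    predictions = prediction_values.reject { |_, v| v.nil? }.collect do |cid, value|
      rows = cids.each_index.select { |r| cids[r] == cid }
      activities = rows.collect { |r| test_activities[r][0] }
      [cid, activities, value, nil]                    # confidence column may be nil now
    end
    # => [["c1", [1.2, 1.4], 1.3, nil]]
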