summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-04-15 11:01:16 +0200
committerChristoph Helma <helma@in-silico.ch>2016-04-15 11:01:16 +0200
commit8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b (patch)
treee3a654da5a042b7b52655be051ce70eeec2a66e7
parent753fcc204d93d86c76860bee6e2f7d0468c3c940 (diff)
data_entries removed from datasets. Datasets are now just containers for compounds and features; feature values have to be retrieved from substances.
-rw-r--r--lib/compound.rb3
-rw-r--r--lib/crossvalidation.rb12
-rw-r--r--lib/dataset.rb65
-rw-r--r--lib/leave-one-out-validation.rb11
-rw-r--r--lib/model.rb44
-rw-r--r--lib/validation.rb5
-rw-r--r--test/prediction_models.rb1
-rw-r--r--test/setup.rb4
-rw-r--r--test/validation.rb16
9 files changed, 70 insertions, 91 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 55cd482..049d77b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -288,8 +288,7 @@ module OpenTox
training_dataset.compounds.each do |compound|
candidate_fingerprint = compound.fingerprint params[:type]
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
- feature_values = training_dataset.values(compound,prediction_feature)
- neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+ neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim]
end
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index f93a04c..752d393 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -22,8 +22,10 @@ module OpenTox
end
def self.create model, n=10
- model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
- bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+ bad_request_error "Unknown model class #{model.class}." unless klass
+
cv = klass.new(
name: model.name,
model_id: model.id,
@@ -35,7 +37,7 @@ module OpenTox
predictions = {}
training_dataset = Dataset.find model.training_dataset_id
training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
$logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
t = Time.now
validation = Validation.create(model, fold[0], fold[1],cv)
@@ -121,7 +123,6 @@ module OpenTox
end
def misclassifications n=nil
- #n = predictions.size unless n
n ||= 10
model = Model::Lazar.find(self.model_id)
training_dataset = Dataset.find(model.training_dataset_id)
@@ -132,8 +133,7 @@ module OpenTox
neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
neighbors.collect! do |n|
neighbor = Compound.find(n[0])
- values = training_dataset.values(neighbor,prediction_feature)
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]}
end
{
:smiles => compound.smiles,
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 274c475..fdf1bfc 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,7 +5,8 @@ module OpenTox
class Dataset
- field :data_entries, type: Hash, default: {}
+ field :substance_ids, type: Array, default: []
+ field :feature_ids, type: Array, default: []
# Readers
@@ -19,13 +20,13 @@ module OpenTox
# Get all substances
def substances
- @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}
+ @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id}
@substances
end
# Get all features
def features
- @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact
+ @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
@features
end
@@ -33,9 +34,9 @@ module OpenTox
# @param compound [OpenTox::Compound] OpenTox Compound object
# @param feature [OpenTox::Feature] OpenTox Feature object
# @return [Array] Data entry values
- def values(compound, feature)
- data_entries[compound.id.to_s][feature.id.to_s]
- end
+ #def values(compound, feature)
+ #data_entries[compound.id.to_s][feature.id.to_s]
+ #end
# Writers
@@ -45,9 +46,9 @@ module OpenTox
end
# Set features
- #def features=(features)
- #self.feature_ids = features.collect{|f| f.id}
- #end
+ def features=(features)
+ self.feature_ids = features.collect{|f| f.id}
+ end
# Dataset operations
@@ -55,8 +56,7 @@ module OpenTox
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
- substance_ids = data_entries.keys
- len = substance_ids.size
+ len = self.substance_ids.size
indices = (0..len-1).to_a.shuffle
mid = (len/n)
chunks = []
@@ -69,19 +69,11 @@ module OpenTox
training_idxs = indices-test_idxs
training_cids = training_idxs.collect{|i| substance_ids[i]}
chunk = [training_cids,test_cids].collect do |cids|
- new_data_entries = {}
- cids.each do |cid|
- data_entries[cid].each do |f,v|
- new_data_entries[cid] ||= {}
- new_data_entries[cid][f] = v
- end
- end
- dataset = self.class.new(:data_entries => new_data_entries, :source => self.id )
+ dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
dataset.compounds.each do |compound|
compound.dataset_ids << dataset.id
compound.save
end
- dataset.save
dataset
end
start = last+1
@@ -90,12 +82,6 @@ module OpenTox
chunks
end
- # Diagnostics
-
- def duplicates feature=self.features.first
- data_entries.select{|sid,f| f[feature.id].size > 1}
- end
-
# Serialisation
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
@@ -161,7 +147,6 @@ module OpenTox
compound_format = feature_names.shift.strip
# TODO nanoparticles
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
-
numeric = []
# guess feature types
feature_names.each_with_index do |f,i|
@@ -180,8 +165,7 @@ module OpenTox
numeric[i] = false
feature = NominalFeature.find_or_create_by(metadata)
end
- @features ||= []
- @features << feature if feature
+ feature_ids << feature.id if feature
end
$logger.debug "Feature values: #{Time.now-time}"
@@ -196,7 +180,7 @@ module OpenTox
table.each_with_index do |vals,i|
ct = Time.now
identifier = vals.shift.strip
- warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
begin
case compound_format
when /SMILES/i
@@ -208,41 +192,38 @@ module OpenTox
rescue
compound = nil
end
- if compound.nil?
- # compound parsers may return nil
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+ if compound.nil? # compound parsers may return nil
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
next
end
+ substance_ids << compound.id
compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
compound_time += Time.now-ct
r += 1
- unless vals.size == @features.size
- warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+ unless vals.size == feature_ids.size
+ warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
next
end
vals.each_with_index do |v,j|
if v.blank?
- warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+ warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
next
elsif numeric[j]
v = v.to_f
else
v = v.strip
end
- self.data_entries[compound.id.to_s] ||= {}
- self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= []
- self.data_entries[compound.id.to_s][@features[j].id.to_s] << v
- compound.toxicities[@features[j].id.to_s] ||= []
- compound.toxicities[@features[j].id.to_s] << v
+ compound.toxicities[feature_ids[j].to_s] ||= []
+ compound.toxicities[feature_ids[j].to_s] << v
compound.save
end
end
compounds.duplicates.each do |compound|
positions = []
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
- warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 10fbe85..ed917eb 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -10,6 +10,8 @@ module OpenTox
field :finished_at, type: Time
def self.create model
+ $logger.debug "#{model.name}: LOO validation started"
+ t = Time.now
model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
predictions = model.predict model.training_dataset.compounds
@@ -17,7 +19,7 @@ module OpenTox
nr_unpredicted = 0
predictions.each do |cid,prediction|
if prediction[:value]
- prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s]
+ prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
else
nr_unpredicted += 1
end
@@ -28,6 +30,7 @@ module OpenTox
loo.predictions = predictions
loo.statistics
loo.save
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
loo
end
@@ -84,16 +87,12 @@ module OpenTox
class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
- field :rmse, type: Float, default: 0.0
+ field :rmse, type: Float, default: 0
field :mae, type: Float, default: 0
- #field :weighted_rmse, type: Float, default: 0
- #field :weighted_mae, type: Float, default: 0
field :r_squared, type: Float
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
-
def statistics
stat = ValidationStatistics.regression predictions
update_attributes(stat)
diff --git a/lib/model.rb b/lib/model.rb
index 1960c10..b82f098 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,6 +20,10 @@ module OpenTox
def training_dataset
Dataset.find(training_dataset_id)
end
+
+ def prediction_feature
+ Feature.find(prediction_feature_id)
+ end
end
class Lazar < Model
@@ -31,13 +35,10 @@ module OpenTox
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
- def initialize training_dataset, params={}
+ def initialize prediction_feature, training_dataset, params={}
super params
- # TODO document convention
- #p training_dataset.features
- prediction_feature = training_dataset.features.first
# set defaults for empty parameters
self.prediction_feature_id ||= prediction_feature.id
self.training_dataset_id ||= training_dataset.id
@@ -49,7 +50,6 @@ module OpenTox
end
def predict_compound compound
- prediction_feature = Feature.find prediction_feature_id
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
# remove neighbors without prediction_feature
# check for database activities (neighbors may include query compound)
@@ -122,18 +122,13 @@ module OpenTox
end
end
-
- def training_activities
- i = training_dataset.feature_ids.index prediction_feature_id
- training_dataset.data_entries.collect{|de| de[i]}
- end
end
class LazarClassification < Lazar
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
+ def self.create prediction_feature, training_dataset, params={}
+ model = self.new prediction_feature, training_dataset, params
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
model.neighbor_algorithm ||= "fingerprint_neighbors"
model.neighbor_algorithm_parameters ||= {}
@@ -151,8 +146,8 @@ module OpenTox
class LazarRegression < Lazar
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
+ def self.create prediction_feature, training_dataset, params={}
+ model = self.new prediction_feature, training_dataset, params
model.neighbor_algorithm ||= "fingerprint_neighbors"
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
model.neighbor_algorithm_parameters ||= {}
@@ -173,13 +168,13 @@ module OpenTox
include Mongoid::Document
include Mongoid::Timestamps
- # TODO field Validations
field :endpoint, type: String
field :species, type: String
field :source, type: String
field :unit, type: String
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
+ field :leave_one_out_validation_id, type: BSON::ObjectId
def predict object
Lazar.find(model_id).predict object
@@ -201,12 +196,16 @@ module OpenTox
repeated_crossvalidation.crossvalidations
end
+ def leave_one_out_validation
+ LeaveOneOutValidation.find leave_one_out_validation_id
+ end
+
def regression?
- training_dataset.features.first.numeric?
+ model.is_a? LazarRegression
end
def classification?
- training_dataset.features.first.nominal?
+ model.is_a? LazarClassification
end
def self.from_csv_file file
@@ -214,14 +213,17 @@ module OpenTox
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
prediction_model = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
+ prediction_feature = training_dataset.features.first
model = nil
- if training_dataset.features.first.nominal?
- model = LazarClassification.create training_dataset
- elsif training_dataset.features.first.numeric?
- model = LazarRegression.create training_dataset
+ if prediction_feature.nominal?
+ model = LazarClassification.create prediction_feature, training_dataset
+ elsif prediction_feature.numeric?
+ model = LazarRegression.create prediction_feature, training_dataset
end
prediction_model[:model_id] = model.id
+ prediction_model[:prediction_feature_id] = prediction_feature.id
prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
+ prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id
prediction_model.save
prediction_model
end
diff --git a/lib/validation.rb b/lib/validation.rb
index 484e22e..6b515e4 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -27,14 +27,14 @@ module OpenTox
atts = model.attributes.dup # do not modify attributes from original model
atts["_id"] = BSON::ObjectId.new
atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create training_set, atts
+ validation_model = model.class.create model.prediction_feature, training_set, atts
validation_model.save
predictions = validation_model.predict test_set.compounds
predictions.each{|cid,p| p.delete(:neighbors)}
nr_unpredicted = 0
predictions.each do |cid,prediction|
if prediction[:value]
- prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s]
+ prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
else
nr_unpredicted += 1
end
@@ -42,7 +42,6 @@ module OpenTox
end
validation = self.new(
:model_id => validation_model.id,
- #:prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
:nr_instances => test_set.compounds.size,
:nr_unpredicted => nr_unpredicted,
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index a2e5fe2..49a2472 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -10,7 +10,6 @@ class PredictionModelTest < MiniTest::Test
assert pm.classification?
refute pm.regression?
pm.crossvalidations.each do |cv|
- p cv
assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.74. This may happen due to an unfavorable training/test set split."
end
prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..e7c32b4 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
+$mongo.database.drop
+$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index e702278..baee2d1 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -6,14 +6,14 @@ class ValidationTest < MiniTest::Test
def test_default_classification_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
cv = ClassificationCrossValidation.create model
assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
end
def test_default_regression_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- model = Model::LazarRegression.create dataset
+ model = Model::LazarRegression.create dataset.features.first, dataset
cv = RegressionCrossValidation.create model
assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split"
assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split"
@@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test
:type => "FP3"
}
}
- model = Model::LazarClassification.create dataset, params
+ model = Model::LazarClassification.create dataset.features.first, dataset, params
model.save
cv = ClassificationCrossValidation.create model
params = model.neighbor_algorithm_parameters
@@ -54,7 +54,7 @@ class ValidationTest < MiniTest::Test
:min_sim => 0.7,
}
}
- model = Model::LazarRegression.create dataset, params
+ model = Model::LazarRegression.create dataset.features.first, dataset, params
cv = RegressionCrossValidation.create model
cv.validation_ids.each do |vid|
model = Model::Lazar.find(Validation.find(vid).model_id)
@@ -70,7 +70,7 @@ class ValidationTest < MiniTest::Test
def test_physchem_regression_crossvalidation
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+ model = Model::LazarRegression.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
cv = RegressionCrossValidation.create model
refute_nil cv.rmse
refute_nil cv.mae
@@ -80,7 +80,7 @@ class ValidationTest < MiniTest::Test
def test_classification_loo_validation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
loo = ClassificationLeaveOneOutValidation.create model
assert_equal 14, loo.nr_unpredicted
refute_empty loo.confusion_matrix
@@ -89,7 +89,7 @@ class ValidationTest < MiniTest::Test
def test_regression_loo_validation
dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- model = Model::LazarRegression.create dataset
+ model = Model::LazarRegression.create dataset.features.first, dataset
loo = RegressionLeaveOneOutValidation.create model
assert loo.r_squared > 0.34
end
@@ -98,7 +98,7 @@ class ValidationTest < MiniTest::Test
def test_repeated_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
repeated_cv = RepeatedCrossValidation.create model
repeated_cv.crossvalidations.each do |cv|
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"