author     Christoph Helma <helma@in-silico.ch>  2015-09-23 14:51:41 +0200
committer  Christoph Helma <helma@in-silico.ch>  2015-09-23 14:51:41 +0200
commit     d5bf97c2cb999539c56bf59aa1d7d3286745be84 (patch)
tree       91d5ab3fd9641c7349d45356d43aef867e4bee92
parent     259cd085e053193b4c166495ae1af35cfa94bcf6 (diff)
validations fixed (previously all validation models were executed with default parameters instead of the validated model's settings)
-rw-r--r--  lib/compound.rb                41
-rw-r--r--  lib/crossvalidation.rb          7
-rw-r--r--  lib/dataset.rb                 40
-rw-r--r--  lib/descriptor.rb               3
-rw-r--r--  lib/experiment.rb             162
-rw-r--r--  lib/model.rb                   88
-rw-r--r--  lib/unique_descriptors.rb       4
-rw-r--r--  lib/validation.rb              12
-rw-r--r--  test/compound.rb               12
-rw-r--r--  test/dataset.rb                10
-rw-r--r--  test/experiment.rb             63
-rw-r--r--  test/lazar-physchem-short.rb   34
-rw-r--r--  test/lazar-regression.rb       10
-rw-r--r--  test/validation.rb             66
14 files changed, 394 insertions, 158 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 7abd913..d3df125 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -44,6 +44,21 @@ module OpenTox
compound.save
compound
end
+
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
+ def mpd
+ smarts = obconversion(smiles,"smi","mpd").strip.split("\t")
+ smarts.shift # remove Title
+ smarts
+
+ end
+
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
+ def mna level=2
+ smarts = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
+ smarts.shift # remove Title
+ smarts
+ end
def openbabel_fingerprint type="FP2"
unless self.send(type.downcase.to_sym) # stored fingerprint
@@ -72,7 +87,7 @@ module OpenTox
end
start += bitsperint
end
- update type.downcase.to_sym, bits_set
+ update_attribute type.downcase.to_sym, bits_set
end
self.send(type.downcase.to_sym)
end
@@ -242,6 +257,28 @@ module OpenTox
neighbors
end
+ def physchem_neighbors params
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
+ neighbors = []
+ feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ # TODO implement pearson and cosine similarity separately
+ R.assign "x", query_fingerprint
+ R.assign "y", fingerprint
+ # pearson r
+ #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
+ #p "pearson"
+ #p sim
+ #p "cosine"
+ sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
+ #p sim
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
def neighbors threshold=0.7
# TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
@@ -308,7 +345,7 @@ print sdf
end
def obconversion(identifier,input_format,output_format,option=nil)
- self.class.obconversion(identifier,input_format,output_format,option=nil)
+ self.class.obconversion(identifier,input_format,output_format,option)
end
end
end
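Note on the new physchem_neighbors: the expression it evaluates in R, x %*% y / sqrt(x %*% x * y %*% y), is plain cosine similarity between the query and training descriptor vectors. A minimal pure-Ruby sketch of the same quantity for reference; the nil filtering (R's complete-observations behaviour) is an assumption, not part of this commit:

    # Cosine similarity between two descriptor vectors, as evaluated in R by
    # physchem_neighbors. Pairs with a missing value are dropped first (assumed).
    def cosine_similarity x, y
      pairs = x.zip(y).reject{|a,b| a.nil? or b.nil?}
      dot   = pairs.inject(0.0){|s,(a,b)| s + a*b}
      norm2 = pairs.inject(0.0){|s,(a,_)| s + a*a} * pairs.inject(0.0){|s,(_,b)| s + b*b}
      norm2 > 0 ? dot / Math.sqrt(norm2) : 0.0
    end

    cosine_similarity [1.0, 2.0, 3.0], [1.0, 2.0, 3.5] # => ~0.997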
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 337b434..4c80344 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -33,15 +33,12 @@ module OpenTox
nr_instances = 0
nr_unpredicted = 0
predictions = []
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
training_dataset = Dataset.find model.training_dataset_id
training_dataset.folds(n).each_with_index do |fold,fold_nr|
fork do # parallel execution of validations
$logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
t = Time.now
- #p validation_class#.create(model, fold[0], fold[1],cv)
- validation = validation_class.create(model, fold[0], fold[1],cv)
- #p validation
+ validation = Validation.create(model, fold[0], fold[1],cv)
$logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
end
end
@@ -170,7 +167,7 @@ module OpenTox
y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
- R.eval "r <- cor(-log(measurement),-log(prediction))"
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
r = R.eval("r").to_ruby
mae = mae/predictions.size
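The use='complete' argument added above makes R compute the correlation only over pairs where both measurement and prediction are present, instead of returning NA as soon as a single prediction is missing. In Ruby terms (an illustrative sketch, not library code):

    # Pearson r over -log-transformed pairs, skipping nil predictions,
    # i.e. what cor(-log(measurement),-log(prediction),use='complete') returns.
    def pearson_r xs, ys
      pairs = xs.zip(ys).reject{|x,y| x.nil? or y.nil?} # 'complete' observations
      n  = pairs.size.to_f
      mx = pairs.inject(0.0){|s,(x,_)| s + x} / n
      my = pairs.inject(0.0){|s,(_,y)| s + y} / n
      cov = pairs.inject(0.0){|s,(x,y)| s + (x - mx) * (y - my)}
      sx  = Math.sqrt(pairs.inject(0.0){|s,(x,_)| s + (x - mx)**2})
      sy  = Math.sqrt(pairs.inject(0.0){|s,(_,y)| s + (y - my)**2})
      cov / (sx * sy)
    end

    measurements = [1.0, 0.5, 0.2]
    predictions  = [0.9, nil, 0.25] # nil for unpredicted compounds
    pearson_r measurements.map{|v| -Math.log(v)}, predictions.map{|v| v && -Math.log(v)}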
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 946fd90..7c8ab44 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -46,6 +46,12 @@ module OpenTox
else
@data_entries = Marshal.load(data_entry_file.data)
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
+ unless @data_entries.first.size == feature_ids.size
+ # TODO: fix (unknown) source of empty data_entries
+ sleep 1
+ data_entry_file = $gridfs.find_one(_id: data_entries_id)
+ @data_entries = Marshal.load(data_entry_file.data)
+ end
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
# TODO: data_entries can be empty, poorly reproducible, mongo problem?
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
@@ -281,6 +287,29 @@ module OpenTox
end
end
end
+
+ def scale
+ scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
+ centers = []
+ scales = []
+ feature_ids.each_with_index do |feature_id,col|
+ R.assign "x", data_entries.collect{|de| de[col]}
+ R.eval "scaled = scale(x,center=T,scale=T)"
+ centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
+ scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
+ R.eval("scaled").to_ruby.each_with_index do |value,row|
+ scaled_data_entries[row][col] = value
+ end
+ end
+ scaled_dataset = ScaledDataset.new(attributes)
+ scaled_dataset["_id"] = BSON::ObjectId.new
+ scaled_dataset["_type"] = "OpenTox::ScaledDataset"
+ scaled_dataset.centers = centers
+ scaled_dataset.scales = scales
+ scaled_dataset.data_entries = scaled_data_entries
+ scaled_dataset.save_all
+ scaled_dataset
+ end
end
# Dataset for lazar predictions
@@ -297,6 +326,17 @@ module OpenTox
# Dataset for descriptors (physchem)
class DescriptorDataset < Dataset
field :feature_calculation_algorithm, type: String
+
+ end
+
+ class ScaledDataset < DescriptorDataset
+
+ field :centers, type: Array, default: []
+ field :scales, type: Array, default: []
+
+ def original_value value, i
+ value * scales[i] + centers[i]
+ end
end
# Dataset for fminer descriptors
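Dataset#scale above standardizes each feature column with R's scale() and stores the per-column centers and scales, so ScaledDataset#original_value can invert the transformation as value * scales[i] + centers[i]. The round trip in isolation (self-contained sketch, sample values invented; R's sd uses the n-1 denominator):

    column = [320.0, 0.8, 160.0, 2.1] # one feature column
    center = column.inject(:+) / column.size
    scale  = Math.sqrt(column.inject(0.0){|s,x| s + (x - center)**2} / (column.size - 1))
    scaled = column.map{|x| (x - center) / scale}
    # original_value: value * scale + center restores the raw descriptor
    scaled.each_with_index do |z,i|
      raise "round trip failed" unless (z * scale + center - column[i]).abs < 1e-8
    end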
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 5ae0ef2..9733bde 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -16,7 +16,7 @@ module OpenTox
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
- obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
name,description = d.split(/\s+/,2)
["Openbabel."+name,description] unless obexclude.include? name
@@ -107,6 +107,7 @@ module OpenTox
des[lib] << descriptor
end
des.each do |lib,descriptors|
+ p lib, descriptors
send(lib, descriptors)
end
serialize
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 0a76c53..616a273 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -4,105 +4,93 @@ module OpenTox
field :dataset_ids, type: Array
field :model_settings, type: Array, default: []
field :results, type: Hash, default: {}
- end
- def run
- dataset_ids.each do |dataset_id|
- dataset = Dataset.find(dataset_id)
- results[dataset_id.to_s] = []
- model_settings.each do |setting|
- model = Object.const_get(setting[:algorithm]).create dataset
- model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
- model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
- model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
- model.save
- repeated_crossvalidation = RepeatedCrossValidation.create model
- results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
+ def run
+ dataset_ids.each do |dataset_id|
+ dataset = Dataset.find(dataset_id)
+ results[dataset_id.to_s] = []
+ model_settings.each do |setting|
+ model_algorithm = setting.delete :model_algorithm
+ model = Object.const_get(model_algorithm).create dataset, setting
+ #model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
+ #model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
+ #model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
+ p model
+ model.save
+ repeated_crossvalidation = RepeatedCrossValidation.create model
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
+ end
end
+ save
end
- save
- end
-
- def self.create params
- experiment = self.new
- $logge.debug "Experiment started ..."
- #experiment.run params
- experiment
- end
- def report
- # TODO significances
- # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
- report = {}
- report[:name] = name
- report[:experiment_id] = self.id.to_s
- report[:results] = {}
- parameters = []
- dataset_ids.each do |dataset_id|
- dataset_name = Dataset.find(dataset_id).name
- report[:results][dataset_name] = {}
- report[:results][dataset_name][:anova] = {}
- report[:results][dataset_name][:data] = []
- results[dataset_id.to_s].each do |result|
- model = Model::Lazar.find(result[:model_id])
- repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
- crossvalidations = repeated_cv.crossvalidations
- if crossvalidations.first.is_a? ClassificationCrossValidation
- parameters = [:accuracy,:true_rate,:predictivity]
- elsif crossvalidations.first.is_a? RegressionCrossValidation
- parameters = [:rmse,:mae,:r_squared]
- end
- summary = {}
- [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
- summary[key] = model[key]
- end
- summary[:nr_instances] = crossvalidations.first.nr_instances
- summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
- summary[:time] = crossvalidations.collect{|cv| cv.time}
- parameters.each do |param|
- summary[param] = crossvalidations.collect{|cv| cv.send(param)}
+ def report
+ # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
+ report = {}
+ report[:name] = name
+ report[:experiment_id] = self.id.to_s
+ report[:results] = {}
+ parameters = []
+ dataset_ids.each do |dataset_id|
+ dataset_name = Dataset.find(dataset_id).name
+ report[:results][dataset_name] = {}
+ report[:results][dataset_name][:anova] = {}
+ report[:results][dataset_name][:data] = []
+ results[dataset_id.to_s].each do |result|
+ model = Model::Lazar.find(result[:model_id])
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+ crossvalidations = repeated_cv.crossvalidations
+ if crossvalidations.first.is_a? ClassificationCrossValidation
+ parameters = [:accuracy,:true_rate,:predictivity]
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
+ parameters = [:rmse,:mae,:r_squared]
+ end
+ summary = {}
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+ summary[key] = model[key]
+ end
+ summary[:nr_instances] = crossvalidations.first.nr_instances
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
+ parameters.each do |param|
+ summary[param] = crossvalidations.collect{|cv| cv.send(param)}
+ end
+ report[:results][dataset_name][:data] << summary
end
- report[:results][dataset_name][:data] << summary
end
- end
- report[:results].each do |dataset,results|
- ([:time,:nr_unpredicted]+parameters).each do |param|
- experiments = []
- outcome = []
- results[:data].each_with_index do |result,i|
- result[param].each do |p|
- experiments << i
- outcome << p
+ report[:results].each do |dataset,results|
+ ([:time,:nr_unpredicted]+parameters).each do |param|
+ experiments = []
+ outcome = []
+ results[:data].each_with_index do |result,i|
+ result[param].each do |p|
+ experiments << i
+ p = nil if p.kind_of? Float and p.infinite? # TODO fix division by 0
+ outcome << p
+ end
end
- end
- R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
- R.eval "experiment_nr = factor(experiment_nr)"
- R.assign "outcome",outcome
- R.eval "data = data.frame(experiment_nr,outcome)"
- # one-way ANOVA
- R.eval "fit = aov(outcome ~ experiment_nr, data=data)"
- # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
- p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
- # aequivalent
- # sum = R.eval("summary(fit)")
- #p_value = sum.to_ruby.first.last.first
+ R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
+ R.eval "experiment_nr = factor(experiment_nr)"
+ R.assign "outcome", outcome
+ R.eval "data = data.frame(experiment_nr,outcome)"
+ # one-way ANOVA
+ R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
+ # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
+ p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
+ # equivalent
+ # sum = R.eval("summary(fit)")
+ #p_value = sum.to_ruby.first.last.first
+ report[:results][dataset][:anova][param] = p_value
=begin
- if p_value < 0.01
- p_value = "#{p_value} ***"
- elsif p_value < 0.05
- p_value = "#{p_value} **"
- elsif p_value < 0.1
- p_value = "#{p_value} *"
- end
=end
- report[:results][dataset][:anova][param] = p_value
+ end
end
+ report
end
- report
- end
- def summary
- report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
+ def summary
+ report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
+ end
end
end
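Experiment#report now runs a one-way ANOVA per statistic across the model settings, with na.action='na.omit' so the nil outcomes substituted for infinite values above do not abort the fit. Condensed, assuming the same R.assign/R.eval bridge used throughout the library (outcome numbers invented):

    R.assign "outcome", [0.61, 0.59, 0.72, 0.74] # e.g. accuracies per CV repetition
    R.assign "experiment_nr", ["Experiment 0", "Experiment 0", "Experiment 1", "Experiment 1"]
    R.eval "experiment_nr = factor(experiment_nr)"
    R.eval "data = data.frame(experiment_nr,outcome)"
    R.eval "fit = aov(outcome ~ experiment_nr, data=data, na.action='na.omit')"
    p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
    # Experiment#summary then reports parameters with p_value < 0.1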
diff --git a/lib/model.rb b/lib/model.rb
index 9892f64..817a61e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -26,25 +26,26 @@ module OpenTox
# algorithms
field :neighbor_algorithm, type: String
- field :neighbor_algorithm_parameters, type: Hash
+ field :neighbor_algorithm_parameters, type: Hash, default: {}
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
- def self.create training_dataset
+ def initialize training_dataset, params={}
+ super params
bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
# TODO document convention
prediction_feature = training_dataset.features.first
- prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
- lazar.training_dataset_id = training_dataset.id
- lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
- lazar.prediction_feature_id = prediction_feature.id
- lazar.name = "#{training_dataset.name} #{prediction_feature.name}"
-
- lazar.save
- lazar
+ # set defaults for empty parameters
+ self.prediction_feature_id ||= prediction_feature.id
+ self.training_dataset_id ||= training_dataset.id
+ self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
+ self.neighbor_algorithm_parameters ||= {}
+ self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+ save
+ self
end
def predict object
@@ -80,6 +81,7 @@ module OpenTox
next
end
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+
#neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
# add activities
# TODO: improve efficiency, takes 3 times longer than previous version
@@ -90,6 +92,17 @@ module OpenTox
end
neighbors.compact! # remove neighbors without training activities
predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size})
+=begin
+# TODO scaled dataset for physchem
+ p neighbor_algorithm_parameters
+ p (neighbor_algorithm_parameters["feature_dataset_id"])
+ d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"])
+ p d
+ p d.class
+ if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset
+ p "SCALED"
+ end
+=end
end
# serialize result
@@ -128,15 +141,40 @@ module OpenTox
end
class LazarClassification < Lazar
- def initialize
- super
- self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
- self.neighbor_algorithm = "fingerprint_neighbors"
- self.neighbor_algorithm_parameters = {
+
+ def self.create training_dataset, params={}
+ model = self.new training_dataset, params
+ model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
+ model.neighbor_algorithm_parameters ||= {}
+ {
:type => "FP4",
- :training_dataset_id => training_dataset_id,
+ :training_dataset_id => training_dataset.id,
:min_sim => 0.7
- }
+ }.each do |key,value|
+ model.neighbor_algorithm_parameters[key] ||= value
+ end
+ model.save
+ model
+ end
+ end
+
+ class LazarRegression < Lazar
+
+ def self.create training_dataset, params={}
+ model = self.new training_dataset, params
+ #model.neighbor_algorithm ||= "fingerprint_neighbors"
+ #model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average"
+ #model.neighbor_algorithm_parameters ||= {}
+ #{
+ #:type => "FP4",
+ #:training_dataset_id => training_dataset.id,
+ #:min_sim => 0.7
+ #}.each do |key,value|
+ #model.neighbor_algorithm_parameters[key] ||= value
+ #end
+ model.save
+ model
end
end
@@ -159,26 +197,12 @@ module OpenTox
end
end
- class LazarRegression < Lazar
-
- def initialize
- super
- #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
- self.neighbor_algorithm = "fingerprint_neighbors"
- self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
- self.neighbor_algorithm_parameters = {
- :type => "FP4",
- :training_dataset_id => self.training_dataset_id,
- :min_sim => 0.7
- }
- end
- end
-
class Prediction
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
+ # TODO cv -> repeated cv
# TODO field Validations
field :endpoint, type: String
field :species, type: String
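The reworked create methods are the actual fix behind the commit message: neighbor algorithm defaults are now merged key by key, so caller-supplied parameters survive instead of being overwritten by the hard-coded hash the old initialize installed. The pattern in isolation (defaults taken from the diff, the caller hash invented):

    defaults = { :type => "FP4", :min_sim => 0.7 }
    params   = { :type => "MACCS" } # supplied by the caller
    defaults.each{|key,value| params[key] ||= value}
    params # => {:type=>"MACCS", :min_sim=>0.7}, defaults only fill the gaps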
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
index 676f34a..cf9cbf3 100644
--- a/lib/unique_descriptors.rb
+++ b/lib/unique_descriptors.rb
@@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [
"Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
"Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
"Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
- "Openbabel.L5", #Lipinski Rule of Five
+ #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
"Openbabel.logP", #octanol/water partition coefficient
"Openbabel.MP", #Melting point
"Openbabel.MR", #molar refractivity
@@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [
"Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
"Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
"Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
- "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
+ #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
"Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
"Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
"Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
diff --git a/lib/validation.rb b/lib/validation.rb
index 63fbd89..9eebef8 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -2,6 +2,7 @@ module OpenTox
class Validation
+ field :model_id, type: BSON::ObjectId
field :prediction_dataset_id, type: BSON::ObjectId
field :crossvalidation_id, type: BSON::ObjectId
field :test_dataset_id, type: BSON::ObjectId
@@ -17,9 +18,17 @@ module OpenTox
Dataset.find test_dataset_id
end
+ def model
+ Model::Lazar.find model_id
+ end
+
def self.create model, training_set, test_set, crossvalidation=nil
- validation_model = model.class.create training_set#, features
+ atts = model.attributes.dup # do not modify attributes from original model
+ atts["_id"] = BSON::ObjectId.new
+ atts[:training_dataset_id] = training_set.id
+ validation_model = model.class.create training_set, atts
+ validation_model.save
test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
prediction_dataset = validation_model.predict test_set_without_activities
predictions = []
@@ -36,6 +45,7 @@ module OpenTox
end
end
validation = self.new(
+ :model_id => validation_model.id,
:prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
:nr_instances => test_set.compound_ids.size,
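The attribute duplication above is what propagates the validated model's settings to each fold model; a commented restatement, assuming Mongoid document semantics:

    atts = model.attributes.dup                   # shallow copy, leave the original model untouched
    atts["_id"] = BSON::ObjectId.new              # new id, otherwise save would overwrite the original
    atts[:training_dataset_id] = training_set.id  # train on the fold, not on the full dataset
    validation_model = model.class.create training_set, atts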
diff --git a/test/compound.rb b/test/compound.rb
index 6a3c696..b33a643 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -134,4 +134,16 @@ print c.sdf
end
end
end
+
+ def test_mna
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
+ p c.mna 4
+ end
+
+ def test_mpd
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
+ assert_equal 13, c.mpd.size
+ assert_equal 7, c.mpd.uniq.size
+ assert_equal c.mpd, c.openbabel_fingerprint("mpd")
+ end
end
diff --git a/test/dataset.rb b/test/dataset.rb
index 84be547..752073e 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -202,5 +202,15 @@ class DatasetTest < MiniTest::Test
assert_equal 0.00323, d2.data_entries[5][0]
end
+ def test_scaled_dataset
+ original_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+ scaled_dataset = original_dataset.scale
+ scaled_dataset.data_entries.each_with_index do |row,i|
+ row.each_with_index do |value,j|
+ assert_equal original_dataset.data_entries[i][j].round(4), scaled_dataset.original_value(value,j).round(4) if value # ignore nils
+ end
+ end
+ end
+
end
diff --git a/test/experiment.rb b/test/experiment.rb
index 4b54768..76a0498 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -63,21 +63,26 @@ class ExperimentTest < MiniTest::Test
end
def test_regression_fingerprints
+=begin
datasets = [
- "LOAEL_mmol_corrected_smiles.csv"
+ "EPAFHM.medi.csv",
+ #"LOAEL_mmol_corrected_smiles.csv"
]
min_sims = [0.3,0.7]
- types = ["FP2","FP3","FP4","MACCS"]
+ #min_sims = [0.7]
+ #types = ["FP2","FP3","FP4","MACCS","mpd"]
+ types = ["mpd","FP3"]
experiment = Experiment.create(
- :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
+ :name => "Fingerprint regression with different types for datasets #{datasets}.",
:dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
)
types.each do |type|
min_sims.each do |min_sim|
experiment.model_settings << {
- :algorithm => "OpenTox::Model::LazarRegression",
+ :model_algorithm => "OpenTox::Model::LazarRegression",
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
:neighbor_algorithm => "fingerprint_neighbors",
- :neighbor_algorithm_parameter => {
+ :neighbor_algorithm_parameters => {
:type => type,
:min_sim => min_sim,
}
@@ -85,7 +90,53 @@ class ExperimentTest < MiniTest::Test
end
end
experiment.run
- p experiment.report
+=end
+#=begin
+ experiment = Experiment.find '56029cb92b72ed673d000000'
+#=end
+ p experiment.id
+ experiment.results.each do |dataset,result|
+ result.each do |r|
+ params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters]
+ RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv|
+ cv.validation_ids.each do |vid|
+ model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters]
+ assert_equal params[:type], model_params[:type]
+ assert_equal params[:min_sim], model_params[:min_sim]
+ refute_equal params[:training_dataset_id], model_params[:training_dataset_id]
+ end
+ end
+ end
+ end
+ puts experiment.report.to_yaml
+ p experiment.summary
+ end
+ def test_mpd_fingerprints
+=begin
+ datasets = [
+ "EPAFHM.medi.csv",
+ ]
+ types = ["FP2","mpd"]
+ experiment = Experiment.create(
+ :name => "FP2 vs mpd fingerprint regression for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ )
+ types.each do |type|
+ experiment.model_settings << {
+ :algorithm => "OpenTox::Model::LazarRegression",
+ :neighbor_algorithm => "fingerprint_neighbors",
+ :neighbor_algorithm_parameter => {
+ :type => type,
+ :min_sim => 0.7,
+ }
+ }
+ end
+ experiment.run
+ p experiment.id
+=end
+ experiment = Experiment.find '55ffd0c02b72ed123c000000'
+ p experiment
+ puts experiment.report.to_yaml
end
end
diff --git a/test/lazar-physchem-short.rb b/test/lazar-physchem-short.rb
index e74a4b9..59d8112 100644
--- a/test/lazar-physchem-short.rb
+++ b/test/lazar-physchem-short.rb
@@ -2,27 +2,29 @@ require_relative "setup.rb"
class LazarPhyschemDescriptorTest < MiniTest::Test
def test_epafhm
- skip "Physchem Regression not yet implemented."
- # check available descriptors
- @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
- assert_equal 111,@descriptors.size,"wrong number of physchem descriptors"
- @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
- # select descriptors for test
- @num_features_offset = 0
- @descriptors.keep_if{|x| x=~/^Openbabel\./}
- @descriptors.delete("Openbabel.L5") # TODO Openbabel.L5 does not work, investigate!!!
- puts "Descriptors: #{@descriptors}"
+ @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys
+ refute_empty @descriptors
# UPLOAD DATA
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- puts "Dataset: "+training_dataset.id
-# feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors
- model = Model::LazarRegression.create training_dataset#, feature_dataset
- #p model
+ feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors
+ scaled_feature_dataset = feature_dataset.scale
+ model = Model::LazarRegression.create training_dataset
+ model.neighbor_algorithm = "physchem_neighbors"
+ model.neighbor_algorithm_parameters = {
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem",
+ :descriptors => @descriptors,
+ :feature_dataset_id => scaled_feature_dataset.id,
+ :min_sim => 0.3
+ }
+ model.save
compound = Compound.from_smiles "CC(C)(C)CN"
prediction = model.predict compound
- p prediction
-
+ refute_nil prediction[:value]
+ refute_nil prediction[:confidence]
+ prediction[:neighbors].each do |line|
+ assert_operator line[1], :>, 0.3
+ end
end
end
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index cc7f356..8b2d473 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -13,6 +13,16 @@ class LazarRegressionTest < MiniTest::Test
assert_equal 1, prediction[:neighbors].size
end
+ def test_mpd_fingerprints
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ model = Model::LazarRegression.create training_dataset
+ model.neighbor_algorithm_parameters[:type] = "mpd"
+ compound = Compound.from_smiles "CCCSCCSCC"
+ prediction = model.predict compound
+ assert_equal 0.04, prediction[:value].round(2)
+ assert_equal 1, prediction[:neighbors].size
+ end
+
def test_local_linear_regression
skip
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
diff --git a/test/validation.rb b/test/validation.rb
index dfa2c81..9717ccc 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -20,10 +20,25 @@ class ValidationTest < MiniTest::Test
end
def test_regression_crossvalidation
- #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
- model = Model::LazarRegression.create dataset
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
+ params = {
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
+ :neighbor_algorithm => "fingerprint_neighbors",
+ :neighbor_algorithm_parameters => {
+ :type => "MACCS",
+ :min_sim => 0.7,
+ }
+ }
+ model = Model::LazarRegression.create dataset, params
cv = RegressionCrossValidation.create model
+ cv.validation_ids.each do |vid|
+ model = Model::Lazar.find(Validation.find(vid).model_id)
+ assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
+ assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
+ refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
+ end
+
#`inkview #{cv.plot}`
#puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n"
#`inkview #{cv.plot}`
@@ -37,12 +52,51 @@ class ValidationTest < MiniTest::Test
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
model = Model::LazarClassification.create dataset
repeated_cv = RepeatedCrossValidation.create model
- p repeated_cv
repeated_cv.crossvalidations.each do |cv|
- p cv
- p cv.accuracy
assert cv.accuracy > 0.7
end
end
+ def test_crossvalidation_parameters
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ params = {
+ :neighbor_algorithm_parameters => {
+ :min_sim => 0.3,
+ :type => "FP3"
+ }
+ }
+ model = Model::LazarClassification.create dataset, params
+ model.save
+ cv = ClassificationCrossValidation.create model
+ params = model.neighbor_algorithm_parameters
+ params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbol keys to strings
+ cv.validations.each do |validation|
+ assert_equal params, validation.model.neighbor_algorithm_parameters
+ end
+ end
+
+ def test_physchem_regression_crossvalidation
+
+ @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys
+ refute_empty @descriptors
+
+ # UPLOAD DATA
+ training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
+ feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors
+ feature_dataset.save
+ scaled_feature_dataset = feature_dataset.scale
+ scaled_feature_dataset.save
+ model = Model::LazarRegression.create training_dataset
+ model.neighbor_algorithm = "physchem_neighbors"
+ model.neighbor_algorithm_parameters = {
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem",
+ :descriptors => @descriptors,
+ :feature_dataset_id => scaled_feature_dataset.id,
+ :min_sim => 0.3
+ }
+ model.save
+ cv = RegressionCrossValidation.create model
+ p cv
+ end
+
end