summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/compound.rb67
-rw-r--r--lib/crossvalidation.rb58
-rw-r--r--lib/dataset.rb7
-rw-r--r--lib/error.rb2
-rw-r--r--lib/experiment.rb83
-rw-r--r--lib/lazar.rb7
-rw-r--r--lib/model.rb27
-rw-r--r--lib/neighbor.rb25
-rw-r--r--lib/opentox.rb1
-rw-r--r--test/compound.rb37
-rw-r--r--test/experiment.rb91
-rw-r--r--test/validation.rb12
12 files changed, 295 insertions, 122 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 6adf3c0..7abd913 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -19,8 +19,11 @@ module OpenTox
field :png_id, type: BSON::ObjectId
field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
+ field :fp2, type: Array
+ field :fp3, type: Array
field :fp4, type: Array
field :fp4_size, type: Integer
+ field :maccs, type: Array
index({smiles: 1}, {unique: true})
@@ -42,6 +45,38 @@ module OpenTox
compound
end
+ def openbabel_fingerprint type="FP2"
+ unless self.send(type.downcase.to_sym) # stored fingerprint
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format "smi"
+ obconversion.read_string obmol, smiles
+ result = OpenBabel::VectorUnsignedInt.new
+ fp.get_fingerprint(obmol,result)
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+ #p OpenBabel::OBFingerprint.describe_bits(result)
+ # convert result to a list of the bits that are set
+ # from openbabel/scripts/python/pybel.py line 830
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+ result = result.to_a
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+ bits_set = []
+ start = 1
+ result.each do |x|
+ i = start
+ while x > 0 do
+ bits_set << i if (x % 2) == 1
+ x >>= 1
+ i += 1
+ end
+ start += bitsperint
+ end
+ update type.downcase.to_sym, bits_set
+ end
+ self.send(type.downcase.to_sym)
+ end
+
# Create a compound from smiles string
# @example
# compound = OpenTox::Compound.from_smiles("c1ccccc1")
@@ -177,6 +212,36 @@ module OpenTox
self["chemblid"]
end
+ def fingerprint_neighbors params
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ neighbors = []
+ query_fingerprint = self.openbabel_fingerprint params[:type]
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ fingerprint = compound.openbabel_fingerprint params[:type]
+ sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f
+ neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ end
+ end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fminer_neighbors params
+ bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
+ neighbors = []
+
+ # find neighbors
+ feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
def neighbors threshold=0.7
# TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
@@ -202,8 +267,6 @@ module OpenTox
$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
end
-=begin
-=end
private
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 90c0d75..337b434 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -22,7 +22,9 @@ module OpenTox
end
def self.create model, n=10
- cv = self.new(
+ model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
+ bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+ cv = klass.new(
name: model.name,
model_id: model.id,
folds: n
@@ -55,6 +57,7 @@ module OpenTox
nr_unpredicted: nr_unpredicted,
predictions: predictions
)
+ cv.statistics
cv
end
end
@@ -70,14 +73,13 @@ module OpenTox
field :predictivity, type: Hash
# TODO auc, f-measure (usability??)
- def self.create model, n=10
- cv = super model, n
+ def statistics
accept_values = Feature.find(model.prediction_feature_id).accept_values
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
true_rate = {}
predictivity = {}
- cv.predictions.each do |pred|
+ predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction and confidence.numeric?
if prediction == activity
@@ -113,18 +115,16 @@ module OpenTox
confidence_sum += c
end
end
- cv.update_attributes(
+ update_attributes(
accept_values: accept_values,
confusion_matrix: confusion_matrix,
weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f,
+ accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
true_rate: true_rate,
predictivity: predictivity,
finished_at: Time.now
)
- cv.save
- cv
end
#Average area under roc 0.646
@@ -142,8 +142,7 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
- def self.create model, n=10
- cv = super model, n
+ def statistics
rmse = 0
weighted_rmse = 0
rse = 0
@@ -153,7 +152,7 @@ module OpenTox
rae = 0
weighted_rae = 0
confidence_sum = 0
- cv.predictions.each do |pred|
+ predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
error = Math.log10(prediction)-Math.log10(activity)
@@ -163,24 +162,24 @@ module OpenTox
weighted_mae += confidence*error.abs
confidence_sum += confidence
else
- cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
end
end
- x = cv.predictions.collect{|p| p[1]}
- y = cv.predictions.collect{|p| p[2]}
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
R.eval "r <- cor(-log(measurement),-log(prediction))"
r = R.eval("r").to_ruby
- mae = mae/cv.predictions.size
+ mae = mae/predictions.size
weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/cv.predictions.size)
+ rmse = Math.sqrt(rmse/predictions.size)
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
# TODO check!!
=begin
- cv.predictions.sort! do |a,b|
+ predictions.sort! do |a,b|
relative_error_a = (a[1]-a[2]).abs/a[1].to_f
relative_error_a = 1/relative_error_a if relative_error_a < 1
relative_error_b = (b[1]-b[2]).abs/b[1].to_f
@@ -188,15 +187,14 @@ module OpenTox
[relative_error_b,b[3]] <=> [relative_error_a,a[3]]
end
=end
- cv.update_attributes(
+ update_attributes(
mae: mae,
rmse: rmse,
weighted_mae: weighted_mae,
weighted_rmse: weighted_rmse,
- r_squared: r**2
+ r_squared: r**2,
+ finished_at: Time.now
)
- cv.save
- cv
end
def misclassifications n=nil
@@ -277,5 +275,21 @@ module OpenTox
end
end
+ class RepeatedCrossValidation
+ field :crossvalidation_ids, type: Array, default: []
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ end
+ repeated_cross_validation.save
+ repeated_cross_validation
+ end
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
+ end
+ end
+
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 851fabd..00e2bc3 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -47,6 +47,7 @@ module OpenTox
@data_entries = Marshal.load(data_entry_file.data)
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
+ # TODO: data_entries can be empty, poorly reproducible, mongo problem?
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
#$logger.debug "Retrieving data: #{Time.now-t}"
end
@@ -151,7 +152,7 @@ module OpenTox
name = File.basename(file,".*")
dataset = self.find_by(:source => source, :name => name)
if dataset
- $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})."
+ $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
else
$logger.debug "Parsing #{file}."
table = CSV.read file, :skip_blanks => true
@@ -202,7 +203,7 @@ module OpenTox
feature = NominalFeature.find_or_create_by(metadata)
end
end
- feature_ids << feature.id
+ feature_ids << feature.id if feature
end
$logger.debug "Feature values: #{Time.now-time}"
@@ -244,7 +245,7 @@ module OpenTox
end
compound_ids << compound.id
- @data_entries << Array.new(table.first.size-1)
+ @data_entries << Array.new(table.first.size-1) if (table.first.size-1) > 0
vals.each_with_index do |v,j|
if v.blank?
diff --git a/lib/error.rb b/lib/error.rb
index 8fe8a1e..39b3c76 100644
--- a/lib/error.rb
+++ b/lib/error.rb
@@ -58,7 +58,7 @@ module OpenTox
OpenTox.const_set error[:class],c
# define global methods for raising errors, eg. bad_request_error
- Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
+ Object.send(:define_method, error[:method]) do |message|
raise c.new(message)
end
end
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 2f51756..985a491 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -2,45 +2,22 @@ module OpenTox
class Experiment
field :dataset_ids, type: Array
- field :model_algorithms, type: Array
- field :model_ids, type: Array, default: []
- field :crossvalidation_ids, type: Array, default: []
- field :prediction_algorithms, type: Array
- field :neighbor_algorithms, type: Array
- field :neighbor_algorithm_parameters, type: Array
+ field :model_settings, type: Array, default: []
+ field :results, type: Hash, default: {}
end
- # TODO more sophisticated experimental design
def run
dataset_ids.each do |dataset_id|
dataset = Dataset.find(dataset_id)
- model_algorithms.each do |model_algorithm|
- prediction_algorithms.each do |prediction_algorithm|
- neighbor_algorithms.each do |neighbor_algorithm|
- neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter|
- $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
- model = Object.const_get(model_algorithm).create dataset
- model.prediction_algorithm = prediction_algorithm
- model.neighbor_algorithm = neighbor_algorithm
- model.neighbor_algorithm_parameters = neighbor_algorithm_parameter
- model.save
- model_ids << model.id
- cv = nil
- if dataset.features.first.nominal
- cv = ClassificationCrossValidation
- elsif dataset.features.first.numeric
- cv = RegressionCrossValidation
- end
- if cv
- $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
- crossvalidation = cv.create model
- self.crossvalidation_ids << crossvalidation.id
- else
- $logger.warn "#{dataset.features.first} is neither nominal nor numeric."
- end
- end
- end
- end
+ results[dataset_id.to_s] = []
+ model_settings.each do |setting|
+ model = Object.const_get(setting[:algorithm]).create dataset
+ model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
+ model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
+ model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
+ model.save
+ repeated_crossvalidation = RepeatedCrossValidation.create model
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
end
end
save
@@ -49,18 +26,42 @@ module OpenTox
def self.create params
experiment = self.new
$logge.debug "Experiment started ..."
- experiment.run params
+ #experiment.run params
experiment
end
def report
- # TODO create ggplot2 report
- self.crossvalidation_ids.each do |id|
- cv = CrossValidation.find(id)
- file = "/tmp/#{id}.svg"
- File.open(file,"w+"){|f| f.puts cv.correlation_plot}
- `inkview '#{file}'`
+ # TODO significances
+ report = {}
+ report[:name] = name
+ report[:experiment_id] = self.id.to_s
+ dataset_ids.each do |dataset_id|
+ dataset_name = Dataset.find(dataset_id).name
+ report[dataset_name] = []
+ results[dataset_id.to_s].each do |result|
+ model = Model::Lazar.find(result[:model_id])
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+ crossvalidations = repeated_cv.crossvalidations
+ summary = {}
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+ summary[key] = model[key]
+ end
+ summary[:nr_instances] = crossvalidations.first.nr_instances
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
+ if crossvalidations.first.is_a? ClassificationCrossValidation
+ summary[:accuracies] = crossvalidations.collect{|cv| cv.accuracy}
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
+ summary[:r_squared] = crossvalidations.collect{|cv| cv.r_squared}
+ end
+ report[dataset_name] << summary
+ #p repeated_cv.crossvalidations.collect{|cv| cv.accuracy}
+ #file = "/tmp/#{id}.svg"
+ #File.open(file,"w+"){|f| f.puts cv.correlation_plot}
+ #`inkview '#{file}'`
+ end
end
+ report
end
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index decbe69..89b50f7 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -15,7 +15,8 @@ ENV["MONGOID_ENV"] ||= "development"
# TODO remove config files, change default via ENV or directly in Mongoid class
Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongoid.default_client
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
+#$mongo = Mongoid.default_client
$gridfs = $mongo.database.fs
# R setup
@@ -42,7 +43,7 @@ ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'
# OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -58,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experi
"bbrc.rb",
"model.rb",
"similarity.rb",
- "neighbor.rb",
+ #"neighbor.rb",
"classification.rb",
"regression.rb",
"validation.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 0155fc8..9892f64 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -28,9 +28,6 @@ module OpenTox
field :neighbor_algorithm, type: String
field :neighbor_algorithm_parameters, type: Hash
- #attr_accessor :prediction_dataset
- #attr_accessor :training_dataset
-
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
@@ -42,6 +39,7 @@ module OpenTox
prediction_feature = training_dataset.features.first
prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
lazar.training_dataset_id = training_dataset.id
+ lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
lazar.prediction_feature_id = prediction_feature.id
lazar.name = "#{training_dataset.name} #{prediction_feature.name}"
@@ -81,7 +79,8 @@ module OpenTox
predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
next
end
- neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+ #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
# add activities
# TODO: improve efficiency, takes 3 times longer than previous version
neighbors.collect! do |n|
@@ -132,8 +131,12 @@ module OpenTox
def initialize
super
self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ self.neighbor_algorithm = "fingerprint_neighbors"
+ self.neighbor_algorithm_parameters = {
+ :type => "FP4",
+ :training_dataset_id => training_dataset_id,
+ :min_sim => 0.7
+ }
end
end
@@ -144,7 +147,7 @@ module OpenTox
model = super(training_dataset)
model.update "_type" => self.to_s # adjust class
model = self.find model.id # adjust class
- model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+ model.neighbor_algorithm = "fminer_neighbors"
model.neighbor_algorithm_parameters = {
:feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
:feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
@@ -157,11 +160,17 @@ module OpenTox
end
class LazarRegression < Lazar
+
def initialize
super
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ self.neighbor_algorithm = "fingerprint_neighbors"
self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ self.neighbor_algorithm_parameters = {
+ :type => "FP4",
+ :training_dataset_id => self.training_dataset_id,
+ :min_sim => 0.7
+ }
end
end
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
deleted file mode 100644
index d849cbf..0000000
--- a/lib/neighbor.rb
+++ /dev/null
@@ -1,25 +0,0 @@
-module OpenTox
- module Algorithm
- class Neighbor
-
- def self.fingerprint_similarity compound, params={}
- compound.neighbors params[:min_sim]
- end
-
- def self.fminer_similarity compound, params
- feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
- neighbors = []
-
- # find neighbors
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
- sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
- if sim > params[:min_sim]
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
- end
- end
- neighbors
- end
- end
- end
-end
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 875487c..186c87a 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -14,7 +14,6 @@ module OpenTox
store_in collection: klass.downcase.pluralize
field :name, type: String
field :warnings, type: Array, default: []
-
end
OpenTox.const_set klass,c
end
diff --git a/test/compound.rb b/test/compound.rb
index 06c19a2..6a3c696 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -97,4 +97,41 @@ print c.sdf
c = Compound.from_inchi(inchi)
assert_equal inchi, c.inchi
end
+
+ def test_openbabel_fingerprint
+ [
+ "CC(=O)CC(C)C#N",
+ "CC(=O)CC(C)C",
+ "C(=O)CC(C)C#N",
+ ].each do |smi|
+ c = OpenTox::Compound.from_smiles smi
+ assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size
+ end
+ end
+
+ def test_fingerprint_neighbors
+ types = ["FP2", "FP3", "FP4", "MACCS"]
+ min_sim = 0.7
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+ [
+ "CC(=O)CC(C)C#N",
+ "CC(=O)CC(C)C",
+ "C(=O)CC(C)C#N",
+ ].each do |smi|
+ c = OpenTox::Compound.from_smiles smi
+ p c.smiles
+ types.each do |type|
+ p type
+ neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
+ p neighbors.collect{|n| [Compound.find(n.first).smiles,n.last]}
+ if type == "FP4"
+ fp4_neighbors = c.neighbors
+ neighbors.each do |n|
+ p [Compound.find(n.first).smiles,n.last] unless fp4_neighbors.include?(n)
+ assert_includes fp4_neighbors, n
+ end
+ end
+ end
+ end
+ end
end
diff --git a/test/experiment.rb b/test/experiment.rb
index c465d7b..4b54768 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -4,27 +4,88 @@ class ExperimentTest < MiniTest::Test
def test_regression_experiment
datasets = [
- "EPAFHM.csv",
- "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
+ "EPAFHM.medi.csv",
+ #"EPAFHM.csv",
+ #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
"LOAEL_mmol_corrected_smiles.csv"
+ ]
+ experiment = Experiment.create(
+ :name => "Default regression for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ :model_settings => [
+ {
+ :algorithm => "OpenTox::Model::LazarRegression",
+ }
]
- model_algorithms = ["OpenTox::Model::LazarRegression"]
- neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"]
- prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"]
- neighbor_algorithm_parameters = [{:min_sim => 0.7}]
+ )
+ #experiment.run
+ puts experiment.report.to_yaml
+ assert_equal datasets.size, experiment.results.size
+ experiment.results.each do |dataset_id, result|
+ assert_equal 1, result.size
+ result.each do |r|
+ assert_kind_of BSON::ObjectId, r[:model_id]
+ assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+ end
+ end
+ end
+
+ def test_classification_experiment
+
+ datasets = [ "hamster_carcinogenicity.csv" ]
experiment = Experiment.create(
- :name => "Regression for datasets #{datasets}.",
+ :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
:dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
- :model_algorithms => model_algorithms,
- :neighbor_algorithms => neighbor_algorithms,
- :neighbor_algorithm_parameters => neighbor_algorithm_parameters,
- :prediction_algorithms => prediction_algorithms,
+ :model_settings => [
+ {
+ :algorithm => "OpenTox::Model::LazarClassification",
+ },{
+ :algorithm => "OpenTox::Model::LazarClassification",
+ :neighbor_algorithm_parameter => {:min_sim => 0.3}
+ },
+ #{
+ #:algorithm => "OpenTox::Model::LazarFminerClassification",
+ #}
+ ]
)
- experiment.run
+ #experiment.run
=begin
- p experiment
- experiment.report
+ experiment = Experiment.find "55f944a22b72ed7de2000000"
=end
- refute_empty experiment.crossvalidation_ids
+ puts experiment.report.to_yaml
+ experiment.results.each do |dataset_id, result|
+ assert_equal 2, result.size
+ result.each do |r|
+ assert_kind_of BSON::ObjectId, r[:model_id]
+ assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+ end
+ end
+ end
+
+ def test_regression_fingerprints
+ datasets = [
+ "LOAEL_mmol_corrected_smiles.csv"
+ ]
+ min_sims = [0.3,0.7]
+ types = ["FP2","FP3","FP4","MACCS"]
+ experiment = Experiment.create(
+ :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ )
+ types.each do |type|
+ min_sims.each do |min_sim|
+ experiment.model_settings << {
+ :algorithm => "OpenTox::Model::LazarRegression",
+ :neighbor_algorithm => "fingerprint_neighbors",
+ :neighbor_algorithm_parameter => {
+ :type => type,
+ :min_sim => min_sim,
+ }
+ }
+ end
+ end
+ experiment.run
+ p experiment.report
+
end
end
diff --git a/test/validation.rb b/test/validation.rb
index a4c3d80..dfa2c81 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -33,4 +33,16 @@ class ValidationTest < MiniTest::Test
#assert cv.weighted_mae < cv.mae
end
+ def test_repeated_crossvalidation
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ model = Model::LazarClassification.create dataset
+ repeated_cv = RepeatedCrossValidation.create model
+ p repeated_cv
+ repeated_cv.crossvalidations.each do |cv|
+ p cv
+ p cv.accuracy
+ assert cv.accuracy > 0.7
+ end
+ end
+
end