summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- lib/compound.rb 84
-rw-r--r-- lib/crossvalidation.rb 3
-rw-r--r-- lib/dataset.rb 2
-rw-r--r-- lib/experiment.rb 4
-rw-r--r-- lib/lazar.rb 2
-rw-r--r-- lib/model.rb 24
-rw-r--r-- lib/neighbor.rb 25
-rw-r--r-- lib/opentox.rb 1
-rw-r--r-- test/compound.rb 26
-rw-r--r-- test/experiment.rb 31
10 files changed, 139 insertions, 63 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 7f175ca..7abd913 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -19,8 +19,11 @@ module OpenTox
field :png_id, type: BSON::ObjectId
field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
+ field :fp2, type: Array
+ field :fp3, type: Array
field :fp4, type: Array
field :fp4_size, type: Integer
+ field :maccs, type: Array
index({smiles: 1}, {unique: true})
@@ -43,32 +46,35 @@ module OpenTox
end
def openbabel_fingerprint type="FP2"
- fp = OpenBabel::OBFingerprint.find_fingerprint(type)
- obmol = OpenBabel::OBMol.new
- obconversion = OpenBabel::OBConversion.new
- obconversion.set_in_format "smi"
- obconversion.read_string obmol, smiles
- result = OpenBabel::VectorUnsignedInt.new
- fp.get_fingerprint(obmol,result)
- # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
- #p OpenBabel::OBFingerprint.describe_bits(result)
- result = result.to_a
- # convert result to a list of the bits that are set
- # from openbabel/scripts/python/pybel.py line 830
- # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
- bitsperint = OpenBabel::OBFingerprint.getbitsperint()
- bits_set = []
- start = 1
- result.each do |x|
- i = start
- while x > 0 do
- bits_set << i if (x % 2) == 1
- x >>= 1
- i += 1
+ unless self.send(type.downcase.to_sym) # stored fingerprint
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format "smi"
+ obconversion.read_string obmol, smiles
+ result = OpenBabel::VectorUnsignedInt.new
+ fp.get_fingerprint(obmol,result)
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+ #p OpenBabel::OBFingerprint.describe_bits(result)
+ # convert result to a list of the bits that are set
+ # from openbabel/scripts/python/pybel.py line 830
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+ result = result.to_a
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+ bits_set = []
+ start = 1
+ result.each do |x|
+ i = start
+ while x > 0 do
+ bits_set << i if (x % 2) == 1
+ x >>= 1
+ i += 1
+ end
+ start += bitsperint
end
- start += bitsperint
+ update type.downcase.to_sym, bits_set
end
- bits_set
+ self.send(type.downcase.to_sym)
end
# Create a compound from smiles string
@@ -206,6 +212,36 @@ module OpenTox
self["chemblid"]
end
+ def fingerprint_neighbors params
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ neighbors = []
+ query_fingerprint = self.openbabel_fingerprint params[:type]
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ fingerprint = compound.openbabel_fingerprint params[:type]
+ sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f
+ neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ end
+ end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fminer_neighbors params
+ bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
+ neighbors = []
+
+ # find neighbors
+ feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
def neighbors threshold=0.7
# TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index f480932..337b434 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -279,7 +279,8 @@ module OpenTox
field :crossvalidation_ids, type: Array, default: []
def self.create model, folds=10, repeats=3
repeated_cross_validation = self.new
- repeats.times do
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
end
repeated_cross_validation.save
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d884716..7d889f8 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -152,7 +152,7 @@ module OpenTox
name = File.basename(file,".*")
dataset = self.find_by(:source => source, :name => name)
if dataset
- $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})."
+ $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
else
$logger.debug "Parsing #{file}."
table = CSV.read file, :skip_blanks => true
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 7849337..985a491 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -2,7 +2,7 @@ module OpenTox
class Experiment
field :dataset_ids, type: Array
- field :model_settings, type: Array
+ field :model_settings, type: Array, default: []
field :results, type: Hash, default: {}
end
@@ -26,7 +26,7 @@ module OpenTox
def self.create params
experiment = self.new
$logge.debug "Experiment started ..."
- experiment.run params
+ #experiment.run params
experiment
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 9b02053..89b50f7 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -59,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat
"bbrc.rb",
"model.rb",
"similarity.rb",
- "neighbor.rb",
+ #"neighbor.rb",
"classification.rb",
"regression.rb",
"validation.rb",
diff --git a/lib/model.rb b/lib/model.rb
index ddb69e4..9892f64 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -39,6 +39,7 @@ module OpenTox
prediction_feature = training_dataset.features.first
prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
lazar.training_dataset_id = training_dataset.id
+ lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
lazar.prediction_feature_id = prediction_feature.id
lazar.name = "#{training_dataset.name} #{prediction_feature.name}"
@@ -78,7 +79,8 @@ module OpenTox
predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
next
end
- neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+ #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
# add activities
# TODO: improve efficiency, takes 3 times longer than previous version
neighbors.collect! do |n|
@@ -129,8 +131,12 @@ module OpenTox
def initialize
super
self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ self.neighbor_algorithm = "fingerprint_neighbors"
+ self.neighbor_algorithm_parameters = {
+ :type => "FP4",
+ :training_dataset_id => training_dataset_id,
+ :min_sim => 0.7
+ }
end
end
@@ -141,7 +147,7 @@ module OpenTox
model = super(training_dataset)
model.update "_type" => self.to_s # adjust class
model = self.find model.id # adjust class
- model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+ model.neighbor_algorithm = "fminer_neighbors"
model.neighbor_algorithm_parameters = {
:feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
:feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
@@ -154,11 +160,17 @@ module OpenTox
end
class LazarRegression < Lazar
+
def initialize
super
- self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ # formerly "OpenTox::Algorithm::Neighbor.fingerprint_similarity"; the neighbor algorithm is now the name of a Compound method
+ self.neighbor_algorithm = "fingerprint_neighbors"
self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
- self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ self.neighbor_algorithm_parameters = {
+ :type => "FP4",
+ :training_dataset_id => self.training_dataset_id,
+ :min_sim => 0.7
+ }
end
end
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
deleted file mode 100644
index d849cbf..0000000
--- a/lib/neighbor.rb
+++ /dev/null
@@ -1,25 +0,0 @@
-module OpenTox
- module Algorithm
- class Neighbor
-
- def self.fingerprint_similarity compound, params={}
- compound.neighbors params[:min_sim]
- end
-
- def self.fminer_similarity compound, params
- feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
- neighbors = []
-
- # find neighbors
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
- sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
- if sim > params[:min_sim]
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
- end
- end
- neighbors
- end
- end
- end
-end
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 875487c..186c87a 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -14,7 +14,6 @@ module OpenTox
store_in collection: klass.downcase.pluralize
field :name, type: String
field :warnings, type: Array, default: []
-
end
OpenTox.const_set klass,c
end
diff --git a/test/compound.rb b/test/compound.rb
index 6deba4e..6a3c696 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -108,4 +108,30 @@ print c.sdf
assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size
end
end
+
+ def test_fingerprint_neighbors
+ types = ["FP2", "FP3", "FP4", "MACCS"]
+ min_sim = 0.7
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+ [
+ "CC(=O)CC(C)C#N",
+ "CC(=O)CC(C)C",
+ "C(=O)CC(C)C#N",
+ ].each do |smi|
+ c = OpenTox::Compound.from_smiles smi
+ p c.smiles
+ types.each do |type|
+ p type
+ neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
+ p neighbors.collect{|n| [Compound.find(n.first).smiles,n.last]}
+ if type == "FP4"
+ fp4_neighbors = c.neighbors
+ neighbors.each do |n|
+ p [Compound.find(n.first).smiles,n.last] unless fp4_neighbors.include?(n)
+ assert_includes fp4_neighbors, n
+ end
+ end
+ end
+ end
+ end
end
diff --git a/test/experiment.rb b/test/experiment.rb
index cad4fa7..4b54768 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -18,7 +18,7 @@ class ExperimentTest < MiniTest::Test
}
]
)
- experiment.run
+ #experiment.run
puts experiment.report.to_yaml
assert_equal datasets.size, experiment.results.size
experiment.results.each do |dataset_id, result|
@@ -48,7 +48,7 @@ class ExperimentTest < MiniTest::Test
#}
]
)
- experiment.run
+ #experiment.run
=begin
experiment = Experiment.find "55f944a22b72ed7de2000000"
=end
@@ -61,4 +61,31 @@ class ExperimentTest < MiniTest::Test
end
end
end
+
+ def test_regression_fingerprints
+ datasets = [
+ "LOAEL_mmol_corrected_smiles.csv"
+ ]
+ min_sims = [0.3,0.7]
+ types = ["FP2","FP3","FP4","MACCS"]
+ experiment = Experiment.create(
+ :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ )
+ types.each do |type|
+ min_sims.each do |min_sim|
+ experiment.model_settings << {
+ :algorithm => "OpenTox::Model::LazarRegression",
+ :neighbor_algorithm => "fingerprint_neighbors",
+ :neighbor_algorithm_parameter => {
+ :type => type,
+ :min_sim => min_sim,
+ }
+ }
+ end
+ end
+ experiment.run
+ p experiment.report
+
+ end
end