diff options
-rw-r--r-- | lib/compound.rb | 84 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 3 | ||||
-rw-r--r-- | lib/dataset.rb | 2 | ||||
-rw-r--r-- | lib/experiment.rb | 4 | ||||
-rw-r--r-- | lib/lazar.rb | 2 | ||||
-rw-r--r-- | lib/model.rb | 24 | ||||
-rw-r--r-- | lib/neighbor.rb | 25 | ||||
-rw-r--r-- | lib/opentox.rb | 1 | ||||
-rw-r--r-- | test/compound.rb | 26 | ||||
-rw-r--r-- | test/experiment.rb | 31 |
10 files changed, 139 insertions, 63 deletions
diff --git a/lib/compound.rb b/lib/compound.rb index 7f175ca..7abd913 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -19,8 +19,11 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId + field :fp2, type: Array + field :fp3, type: Array field :fp4, type: Array field :fp4_size, type: Integer + field :maccs, type: Array index({smiles: 1}, {unique: true}) @@ -43,32 +46,35 @@ module OpenTox end def openbabel_fingerprint type="FP2" - fp = OpenBabel::OBFingerprint.find_fingerprint(type) - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format "smi" - obconversion.read_string obmol, smiles - result = OpenBabel::VectorUnsignedInt.new - fp.get_fingerprint(obmol,result) - # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i - #p OpenBabel::OBFingerprint.describe_bits(result) - result = result.to_a - # convert result to a list of the bits that are set - # from openbabel/scripts/python/pybel.py line 830 - # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints - bitsperint = OpenBabel::OBFingerprint.getbitsperint() - bits_set = [] - start = 1 - result.each do |x| - i = start - while x > 0 do - bits_set << i if (x % 2) == 1 - x >>= 1 - i += 1 + unless self.send(type.downcase.to_sym) # stored fingerprint + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint end - start += bitsperint + update type.downcase.to_sym, bits_set end - bits_set + self.send(type.downcase.to_sym) end # Create a compound from smiles string @@ -206,6 +212,36 @@ module OpenTox self["chemblid"] end + def fingerprint_neighbors params + bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] + neighbors = [] + query_fingerprint = self.openbabel_fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + fingerprint = compound.openbabel_fingerprint params[:type] + sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f + neighbors << [compound.id, sim] if sim >= params[:min_sim] + end + end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fminer_neighbors params + bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |fingerprint, i| + sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + def neighbors threshold=0.7 # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index f480932..337b434 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -279,7 +279,8 @@ module OpenTox field :crossvalidation_ids, type: Array, default: [] def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new - repeats.times do + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id end repeated_cross_validation.save diff --git a/lib/dataset.rb b/lib/dataset.rb index d884716..7d889f8 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -152,7 +152,7 @@ module OpenTox name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset - $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." else $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true diff --git a/lib/experiment.rb b/lib/experiment.rb index 7849337..985a491 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -2,7 +2,7 @@ module OpenTox class Experiment field :dataset_ids, type: Array - field :model_settings, type: Array + field :model_settings, type: Array, default: [] field :results, type: Hash, default: {} end @@ -26,7 +26,7 @@ module OpenTox def self.create params experiment = self.new $logge.debug "Experiment started ..." - experiment.run params + #experiment.run params experiment end diff --git a/lib/lazar.rb b/lib/lazar.rb index 9b02053..89b50f7 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -59,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat "bbrc.rb", "model.rb", "similarity.rb", - "neighbor.rb", + #"neighbor.rb", "classification.rb", "regression.rb", "validation.rb", diff --git a/lib/model.rb b/lib/model.rb index ddb69e4..9892f64 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -39,6 +39,7 @@ module OpenTox prediction_feature = training_dataset.features.first prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new lazar.training_dataset_id = training_dataset.id + lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id lazar.prediction_feature_id = prediction_feature.id lazar.name = "#{training_dataset.name} #{prediction_feature.name}" @@ -78,7 +79,8 @@ module OpenTox predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} next end - neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version neighbors.collect! do |n| @@ -129,8 +131,12 @@ module OpenTox def initialize super self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm = "fingerprint_neighbors" + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => training_dataset_id, + :min_sim => 0.7 + } end end @@ -141,7 +147,7 @@ module OpenTox model = super(training_dataset) model.update "_type" => self.to_s # adjust class model = self.find model.id # adjust class - model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm = "fminer_neighbors" model.neighbor_algorithm_parameters = { :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, @@ -154,11 +160,17 @@ module OpenTox end class LazarRegression < Lazar + def initialize super - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm = "fingerprint_neighbors" self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => self.training_dataset_id, + :min_sim => 0.7 + } end end diff --git a/lib/neighbor.rb b/lib/neighbor.rb deleted file mode 100644 index d849cbf..0000000 --- a/lib/neighbor.rb +++ /dev/null @@ -1,25 +0,0 @@ -module OpenTox - module Algorithm - class Neighbor - - def self.fingerprint_similarity compound, params={} - compound.neighbors params[:min_sim] - end - - def self.fminer_similarity compound, params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |fingerprint, i| - sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint - if sim > params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - end - end -end diff --git a/lib/opentox.rb b/lib/opentox.rb index 875487c..186c87a 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -14,7 +14,6 @@ module OpenTox store_in collection: klass.downcase.pluralize field :name, type: String field :warnings, type: Array, default: [] - end OpenTox.const_set klass,c end diff --git a/test/compound.rb b/test/compound.rb index 6deba4e..6a3c696 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -108,4 +108,30 @@ print c.sdf assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size end end + + def test_fingerprint_neighbors + types = ["FP2", "FP3", "FP4", "MACCS"] + min_sim = 0.7 + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv") + [ + "CC(=O)CC(C)C#N", + "CC(=O)CC(C)C", + "C(=O)CC(C)C#N", + ].each do |smi| + c = OpenTox::Compound.from_smiles smi + p c.smiles + types.each do |type| + p type + neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim}) + p neighbors.collect{|n| [Compound.find(n.first).smiles,n.last]} + if type == "FP4" + fp4_neighbors = c.neighbors + neighbors.each do |n| + p [Compound.find(n.first).smiles,n.last] unless fp4_neighbors.include?(n) + assert_includes fp4_neighbors, n + end + end + end + end + end end diff --git a/test/experiment.rb b/test/experiment.rb index cad4fa7..4b54768 100644 --- a/test/experiment.rb +++ b/test/experiment.rb @@ -18,7 +18,7 @@ class ExperimentTest < MiniTest::Test } ] ) - experiment.run + #experiment.run puts experiment.report.to_yaml assert_equal datasets.size, experiment.results.size experiment.results.each do |dataset_id, result| @@ -48,7 +48,7 @@ class ExperimentTest < MiniTest::Test #} ] ) - experiment.run + #experiment.run =begin experiment = Experiment.find "55f944a22b72ed7de2000000" =end @@ -61,4 +61,31 @@ class ExperimentTest < MiniTest::Test end end end + + def test_regression_fingerprints + datasets = [ + "LOAEL_mmol_corrected_smiles.csv" + ] + min_sims = [0.3,0.7] + types = ["FP2","FP3","FP4","MACCS"] + experiment = Experiment.create( + :name => "Fminer vs fingerprint classification for datasets #{datasets}.", + :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, + ) + types.each do |type| + min_sims.each do |min_sim| + experiment.model_settings << { + :algorithm => "OpenTox::Model::LazarRegression", + :neighbor_algorithm => "fingerprint_neighbors", + :neighbor_algorithm_parameter => { + :type => type, + :min_sim => min_sim, + } + } + end + end + experiment.run + p experiment.report + + end end |