From 6ac119c32cef094d4f1c2fb5c2daa4e274401f70 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 17 Sep 2015 14:56:25 +0200 Subject: neighbor calculation moved to Compound class --- lib/compound.rb | 84 +++++++++++++++++++++++++++++++++++--------------- lib/crossvalidation.rb | 3 +- lib/dataset.rb | 2 +- lib/experiment.rb | 4 +-- lib/lazar.rb | 2 +- lib/model.rb | 24 +++++++++++---- lib/neighbor.rb | 25 --------------- lib/opentox.rb | 1 - 8 files changed, 84 insertions(+), 61 deletions(-) delete mode 100644 lib/neighbor.rb (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 7f175ca..7abd913 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -19,8 +19,11 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId + field :fp2, type: Array + field :fp3, type: Array field :fp4, type: Array field :fp4_size, type: Integer + field :maccs, type: Array index({smiles: 1}, {unique: true}) @@ -43,32 +46,35 @@ module OpenTox end def openbabel_fingerprint type="FP2" - fp = OpenBabel::OBFingerprint.find_fingerprint(type) - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format "smi" - obconversion.read_string obmol, smiles - result = OpenBabel::VectorUnsignedInt.new - fp.get_fingerprint(obmol,result) - # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i - #p OpenBabel::OBFingerprint.describe_bits(result) - result = result.to_a - # convert result to a list of the bits that are set - # from openbabel/scripts/python/pybel.py line 830 - # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints - bitsperint = OpenBabel::OBFingerprint.getbitsperint() - bits_set = [] - start = 1 - result.each do |x| - i = start - while x > 0 do - bits_set << i if (x % 2) == 1 - x >>= 1 - i += 1 + unless self.send(type.downcase.to_sym) # stored fingerprint + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint end - start += bitsperint + update type.downcase.to_sym, bits_set end - bits_set + self.send(type.downcase.to_sym) end # Create a compound from smiles string @@ -206,6 +212,36 @@ module OpenTox self["chemblid"] end + def fingerprint_neighbors params + bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] + neighbors = [] + query_fingerprint = self.openbabel_fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + fingerprint = compound.openbabel_fingerprint params[:type] + sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f + neighbors << [compound.id, sim] if sim >= params[:min_sim] + end + end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fminer_neighbors params + bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |fingerprint, i| + sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + def neighbors threshold=0.7 # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index f480932..337b434 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -279,7 +279,8 @@ module OpenTox field :crossvalidation_ids, type: Array, default: [] def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new - repeats.times do + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id end repeated_cross_validation.save diff --git a/lib/dataset.rb b/lib/dataset.rb index d884716..7d889f8 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -152,7 +152,7 @@ module OpenTox name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset - $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." else $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true diff --git a/lib/experiment.rb b/lib/experiment.rb index 7849337..985a491 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -2,7 +2,7 @@ module OpenTox class Experiment field :dataset_ids, type: Array - field :model_settings, type: Array + field :model_settings, type: Array, default: [] field :results, type: Hash, default: {} end @@ -26,7 +26,7 @@ module OpenTox def self.create params experiment = self.new $logge.debug "Experiment started ..." - experiment.run params + #experiment.run params experiment end diff --git a/lib/lazar.rb b/lib/lazar.rb index 9b02053..89b50f7 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -59,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat "bbrc.rb", "model.rb", "similarity.rb", - "neighbor.rb", + #"neighbor.rb", "classification.rb", "regression.rb", "validation.rb", diff --git a/lib/model.rb b/lib/model.rb index ddb69e4..9892f64 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -39,6 +39,7 @@ module OpenTox prediction_feature = training_dataset.features.first prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new lazar.training_dataset_id = training_dataset.id + lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id lazar.prediction_feature_id = prediction_feature.id lazar.name = "#{training_dataset.name} #{prediction_feature.name}" @@ -78,7 +79,8 @@ module OpenTox predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} next end - neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version neighbors.collect! do |n| @@ -129,8 +131,12 @@ module OpenTox def initialize super self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm = "fingerprint_neighbors" + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => training_dataset_id, + :min_sim => 0.7 + } end end @@ -141,7 +147,7 @@ module OpenTox model = super(training_dataset) model.update "_type" => self.to_s # adjust class model = self.find model.id # adjust class - model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm = "fminer_neighbors" model.neighbor_algorithm_parameters = { :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, @@ -154,11 +160,17 @@ module OpenTox end class LazarRegression < Lazar + def initialize super - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm = "fingerprint_neighbors" self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => self.training_dataset_id, + :min_sim => 0.7 + } end end diff --git a/lib/neighbor.rb b/lib/neighbor.rb deleted file mode 100644 index d849cbf..0000000 --- a/lib/neighbor.rb +++ /dev/null @@ -1,25 +0,0 @@ -module OpenTox - module Algorithm - class Neighbor - - def self.fingerprint_similarity compound, params={} - compound.neighbors params[:min_sim] - end - - def self.fminer_similarity compound, params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |fingerprint, i| - sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint - if sim > params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - end - end -end diff --git a/lib/opentox.rb b/lib/opentox.rb index 875487c..186c87a 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -14,7 +14,6 @@ module OpenTox store_in collection: klass.downcase.pluralize field :name, type: String field :warnings, type: Array, default: [] - end OpenTox.const_set klass,c end -- cgit v1.2.3