From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 4 Nov 2015 17:50:17 +0100
Subject: neighbor search delegated to database backend

---
 lib/compound.rb | 52 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

(limited to 'lib/compound.rb')

diff --git a/lib/compound.rb b/lib/compound.rb
index a26528b..c5e7f02 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -23,13 +23,16 @@ module OpenTox
     field :sdf_id, type: BSON::ObjectId
     field :fingerprints, type: Hash, default: {}
     field :default_fingerprint_size, type: Integer
+    field :dataset_ids, type: Array, default: []
+    field :features, type: Hash, default: {}
 
     index({smiles: 1}, {unique: true})
+    #index({default_fingerprint: 1}, {unique: false})
 
     # Overwrites standard Mongoid method to create fingerprints before database insertion
     def self.find_or_create_by params
       compound = self.find_or_initialize_by params
-      compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
+      compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
       compound.save
       compound
     end
@@ -41,7 +44,7 @@ module OpenTox
         if type == "MP2D"
           fp = obconversion(smiles,"smi","mpd").strip.split("\t")
           name = fp.shift # remove Title
-          fingerprints[type] = fp
+          fingerprints[type] = fp.uniq # no fingerprint counts
         #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
         elsif type== "MNA"
           level = 2 # TODO: level as parameter, evaluate level 1, see paper
@@ -244,20 +247,23 @@ module OpenTox
     def fingerprint_neighbors params
       bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
       neighbors = []
-      #if params[:type] == DEFAULT_FINGERPRINT
-        #neighbors = db_neighbors params
-        #p neighbors
-      #else
+      if params[:type] == DEFAULT_FINGERPRINT
+        neighbors = db_neighbors params
+      else 
         query_fingerprint = self.fingerprint params[:type]
-        training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
-          unless self == compound
+        training_dataset = Dataset.find(params[:training_dataset_id])
+        prediction_feature = training_dataset.features.first
+        training_dataset.compounds.each do |compound|
+          #unless self == compound
             candidate_fingerprint = compound.fingerprint params[:type]
             sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-            neighbors << [compound.id, sim] if sim >= params[:min_sim]
-          end
+            feature_values = training_dataset.values(compound,prediction_feature)
+            neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+          #end
         end
-      #end
-      neighbors.sort{|a,b| b.last <=> a.last}
+        neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
+      end
+      neighbors
     end
 
     def fminer_neighbors params
@@ -299,30 +305,34 @@ module OpenTox
     end
 
     def db_neighbors params
-      p "DB NEIGHBORS"
-      p params
-      # TODO restrict to dataset
       # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
-      qn = fingerprint(params[:type]).size
+
+      #qn = default_fingerprint_size
       #qmin = qn * threshold
       #qmax = qn / threshold
       #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
       #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
       aggregate = [
         #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
-        {'$match' =>  {'_id' => {'$ne' => self.id}}}, # remove self
+        #{'$match' =>  {'_id' => {'$ne' => self.id}}}, # remove self
         {'$project' => {
           'tanimoto' => {'$let' => {
-            'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
-            'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
+            'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
+            #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
+            'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
           }},
-          '_id' => 1
+          '_id' => 1,
+          'features' => 1,
+          'dataset_ids' => 1
         }},
         {'$match' =>  {'tanimoto' => {'$gte' => params[:min_sim]}}},
         {'$sort' => {'tanimoto' => -1}}
       ]
       
-      $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
+      $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+
+
+      #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
         
     end
 
-- 
cgit v1.2.3