From 04af01b8135ea147e9ce253e5526e3ee3adcc675 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 7 Aug 2015 19:50:09 +0200
Subject: initial k-nn weighted average implementation

---
 lib/compound.rb       | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/dataset.rb        | 27 +++++++++++++++++---
 lib/feature.rb        |  7 ++++-
 lib/opentox-client.rb |  6 +++--
 4 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/lib/compound.rb b/lib/compound.rb
index 0c27553..4d292f1 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -1,7 +1,7 @@
 # TODO: check
 # *** Open Babel Error  in ParseFile
 #    Could not find contribution data file.
-# 3d creation??
+
 CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
 require 'openbabel'
 require "base64"
@@ -9,6 +9,34 @@ require "base64"
 module OpenTox
 
   class Compound
+    include OpenTox
+
+# OpenBabel FP4 fingerprints (SMARTS patterns read from SMARTS_InteLigand.txt, one "name: smarts" entry per line)
+# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
+# TODO store in DB
+fp4 = FingerprintSmarts.find # NOTE(review): confirm what find without arguments returns, only nil/false triggers the import below
+unless fp4
+  fp4 = []
+  File.foreach(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")) do |l| # foreach closes the file, File.open(...).each left it open
+    l.strip!
+    unless l.empty? or l.start_with?("#") # skip blank lines and comment lines
+      name,smarts = l.split(': ')
+      fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
+    end
+  end
+end
+FP4 = fp4
+
+# TODO investigate other types of fingerprints (MACCS)
+# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
+# http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
+# OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
+# Morgan ECFP, FCFP
+# http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
+# http://www.rdkit.org/docs/GettingStartedInPython.html
+# Chemfp
+# https://chemfp.readthedocs.org/en/latest/using-tools.html
+# CACTVS/PubChem
 
     field :inchi, type: String
     attr_readonly :inchi
@@ -19,6 +47,8 @@ module OpenTox
     field :chemblid, type: String
     field :image_id, type: BSON::ObjectId
     field :sdf_id, type: BSON::ObjectId
+    field :fp4, type: Array
+    field :fp4_size, type: Integer
     #belongs_to :dataset
     #belongs_to :data_entry
 
@@ -26,6 +56,22 @@ module OpenTox
       #self.inchi == compound.inchi
     #end
 
+    # Find or initialize the compound matching params, store the ids of all matching FP4 SMARTS if missing, and save it.
+    def self.find_or_create_by params
+      compound = self.find_or_initialize_by params
+      if not compound.fp4
+        compound.fp4_size = 0
+        compound.fp4 = []
+        matches = Algorithm::Descriptor.smarts_match(compound, FP4.map(&:smarts))
+        matches.each_with_index do |count,idx|
+          next unless count > 0
+          compound.fp4 << FP4[idx].id
+          compound.fp4_size += 1
+        end
+      end
+      compound.tap{|c| c.save}
+    end
+
     # Create a compound from smiles string
     # @example
     #   compound = OpenTox::Compound.from_smiles("c1ccccc1")
@@ -124,6 +170,29 @@ module OpenTox
       self["chemblid"]
     end
 
+    # Find compounds with a similar FP4 fingerprint (Tanimoto coefficient calculated within MongoDB).
+    # Adapted from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
+    # The bit count prefilter of that post is not implemented, it needs feature counts that have to be kept up to date.
+    # @param threshold [Numeric] minimum Tanimoto similarity of the returned neighbors
+    # @return [Array] [compound, tanimoto] pairs sorted by decreasing similarity, empty for compounds without fingerprint
+    def neighbors threshold=0.3
+      return [] if fp4.nil? || fp4.empty? # similarities would be 0 or 0/0, which aborts the aggregation
+      qn = fp4.size
+      aggregate = [
+        {'$match': {'fp4_size': {'$gte': 0}}}, # $size rejects documents without fp4 array (find_or_create_by stores fp4_size together with fp4)
+        {'$project': {
+          'tanimoto': {'$let': {
+            'vars': {'common': {'$size': {'$setIntersection': ['$fp4', fp4]}}},
+            'in': {'$divide': ['$$common', {'$subtract': [{'$add': [qn, '$fp4_size']}, '$$common']}]}
+          }},
+          '_id': 1
+        }},
+        {'$match':  {'tanimoto': {'$gte': threshold}}},
+        {'$sort': {'tanimoto': -1}}
+      ]
+      $mongo["compounds"].aggregate(aggregate).collect { |r| [Compound.find(r["_id"]), r["tanimoto"]]}
+    end
+
     private
 
     def self.obconversion(identifier,input_format,output_format,option=nil)
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 152545b..0447bb0 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -10,6 +10,7 @@ module OpenTox
     def prediction_feature
       Feature.find prediction_feature_id
     end
+
   end
 
   class DescriptorDataset < Dataset
@@ -74,6 +75,18 @@ module OpenTox
       @data_entries[row][col] = v
     end
 
+    def correlation_plot training_dataset # R featurePlot of this dataset's entries against the first column of training_dataset
+      R.assign("features", data_entries)
+      R.assign("activities", training_dataset.data_entries.map(&:first))
+      R.eval("featurePlot(features,activities)") # NOTE(review): presumably caret's featurePlot, the R session has to provide it
+    end
+
+    def density_plot # R density plot of the log-transformed first column of data_entries, titled with the first feature name
+      R.assign "acts", data_entries.collect{|r| r.first }#.compact
+      R.assign "plot_title", "log(#{features.first.name})" # passed as data: a quote in the feature name would break R code built by interpolation
+      R.eval "plot(density(log(acts),na.rm= TRUE), main=plot_title)" # TODO kill Rserve plots
+    end
+
     # merge dataset (i.e. append features)
     def +(dataset)
       bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
@@ -89,7 +102,8 @@ module OpenTox
     end
 
     def fingerprint(compound)
-      data_entries[compound_ids.index(compound.id)]
+      position = compound_ids.index(compound.id)
+      data_entries[position] unless position.nil? # nil for compounds that are not part of the dataset
     end
 
     def data_entries
@@ -209,6 +223,8 @@ module OpenTox
     # does a lot of guesswork in order to determine feature types
     def parse_table table, bioassay=true
 
+      # TODO: remove empty entries + write tests
+
       time = Time.now
 
       # features
@@ -220,7 +236,7 @@ module OpenTox
       numeric = []
       # guess feature types
       feature_names.each_with_index do |f,i|
-        metadata = {}
+        metadata = {:name => f}
         values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
         types = values.collect{|v| v.numeric? ? true : false}.uniq
         if values.size == 0 # empty feature
@@ -246,7 +262,7 @@ module OpenTox
             feature = NominalFeature.find_or_create_by(metadata)
           end
         end
-        feature_ids << OpenTox::Feature.find_or_create_by(metadata).id
+        feature_ids << feature.id
       end
       
       $logger.debug "Feature values: #{Time.now-time}"
@@ -262,6 +278,11 @@ module OpenTox
       table.each_with_index do |vals,i|
         ct = Time.now
         identifier = vals.shift
+        #if vals.compact.empty?
+          #warnings << "No values for compound at position #{i+2}, all entries are ignored."
+          #@data_entries.pop
+          #next
+        #end
         begin
           case compound_format
           when /SMILES/i
diff --git a/lib/feature.rb b/lib/feature.rb
index 0801a47..005d78f 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -27,7 +27,8 @@ module OpenTox
   end
 
   class Smarts < NominalFeature
-    field :name, as: :smarts, type: String # causes warnings
+    field :smarts, type: String 
+    #field :name, as: :smarts, type: String # causes warnings
     field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptors.smarts_match"
     field :parameters, type: Hash, default: {:count => false}
     def initialize params
@@ -46,6 +47,10 @@ module OpenTox
     end
   end
 
+  class FingerprintSmarts < Smarts # SMARTS feature used as a fingerprint bit (Compound::FP4 is filled with instances of this class)
+    field :count, type: Integer # NOTE(review): nothing in this change writes count, presumably an occurrence counter; confirm
+  end
+
   class NominalBioAssay < NominalFeature
     field :description, type: String
   end
diff --git a/lib/opentox-client.rb b/lib/opentox-client.rb
index 42e8186..092b84e 100644
--- a/lib/opentox-client.rb
+++ b/lib/opentox-client.rb
@@ -5,12 +5,14 @@ require 'yaml'
 require 'json'
 require 'logger'
 require 'mongoid'
+require 'rserve'
 
 # TODO store development/test, validation, production in separate databases
 ENV["MONGOID_ENV"] = "development"
 Mongoid.load!("#{ENV['HOME']}/.opentox/config/mongoid.yml")
+R = Rserve::Connection.new
 
-CLASSES = ["Compound", "Feature", "Dataset"]#, "Validation", "Task", "Investigation"]
+CLASSES = ["Feature","Compound",  "Dataset"]#, "Validation", "Task", "Investigation"]
 #CLASSES = ["Feature", "Dataset", "Validation", "Task", "Investigation"]
 
 # Regular expressions for parsing classification data
@@ -26,8 +28,8 @@ FALSE_REGEXP = /^(false|inactive|0|0.0|low tox|deactivating|non-carcinogen|non-m
   #"otlogger.rb", 
   "opentox.rb",
   #"task.rb",
-  "compound.rb",
   "feature.rb",
+  "compound.rb",
   #"data_entry.rb",
   "dataset.rb",
   #"algorithm.rb",
-- 
cgit v1.2.3