From 04af01b8135ea147e9ce253e5526e3ee3adcc675 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Aug 2015 19:50:09 +0200 Subject: initial k-nn weighted average implementation --- lib/compound.rb | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++- lib/dataset.rb | 27 +++++++++++++++++--- lib/feature.rb | 7 ++++- lib/opentox-client.rb | 6 +++-- 4 files changed, 104 insertions(+), 7 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 0c27553..4d292f1 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,7 +1,7 @@ # TODO: check # *** Open Babel Error in ParseFile # Could not find contribution data file. -# 3d creation?? + CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" require 'openbabel' require "base64" @@ -9,6 +9,34 @@ require "base64" module OpenTox class Compound + include OpenTox + +# OpenBabel FP4 fingerprints +# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html +# TODO store in DB +fp4 = FingerprintSmarts.find +unless fp4 + fp4 = [] + File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l| + l.strip! + unless l.empty? or l.match /^#/ + name,smarts = l.split(': ') + fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil? + end + end +end +FP4 = fp4 + +# TODO investigate other types of fingerprints (MACCS) +# OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html +# http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html +# OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna +# Morgan ECFP, FCFP +# http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html +# http://www.rdkit.org/docs/GettingStartedInPython.html +# Chemfp +# https://chemfp.readthedocs.org/en/latest/using-tools.html +# CACTVS/PubChem field :inchi, type: String attr_readonly :inchi @@ -19,6 +47,8 @@ module OpenTox field :chemblid, type: String field :image_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId + field :fp4, type: Array + field :fp4_size, type: Integer #belongs_to :dataset #belongs_to :data_entry @@ -26,6 +56,22 @@ module OpenTox #self.inchi == compound.inchi #end + def self.find_or_create_by params + compound = self.find_or_initialize_by params + unless compound.fp4 + compound.fp4_size = 0 + compound.fp4 = [] + Algorithm::Descriptor.smarts_match(compound, FP4.collect{|f| f.smarts}).each_with_index do |m,i| + if m > 0 + compound.fp4 << FP4[i].id + compound.fp4_size += 1 + end + end + end + compound.save + compound + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") @@ -124,6 +170,29 @@ module OpenTox self["chemblid"] end + def neighbors threshold=0.3 + # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb + qn = fp4.size + qmin = qn * threshold + qmax = qn / threshold + #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) + #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] + aggregate = [ + #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, + {'$project': { + 'tanimoto': {'$let': { + 'vars': {'common': {'$size': {'$setIntersection': ['$fp4', fp4]}}}, + 'in': {'$divide': ['$$common', {'$subtract': [{'$add': [qn, '$fp4_size']}, '$$common']}]} + }}, + '_id': 1 + }}, + {'$match': {'tanimoto': {'$gte': threshold}}}, + {'$sort': {'tanimoto': -1}} + ] + + $mongo["compounds"].aggregate(aggregate).collect { |r| [Compound.find(r["_id"]), r["tanimoto"]]} + end + private def self.obconversion(identifier,input_format,output_format,option=nil) diff --git a/lib/dataset.rb b/lib/dataset.rb index 152545b..0447bb0 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -10,6 +10,7 @@ module OpenTox def prediction_feature Feature.find prediction_feature_id end + end class DescriptorDataset < Dataset @@ -74,6 +75,18 @@ module OpenTox @data_entries[row][col] = v end + def correlation_plot training_dataset + R.assign "features", data_entries + R.assign "activities", training_dataset.data_entries.collect{|de| de.first} + R.eval "featurePlot(features,activities)" + end + + def density_plot + R.assign "acts", data_entries.collect{|r| r.first }#.compact + R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')" + # TODO kill Rserve plots + end + # merge dataset (i.e. append features) def +(dataset) bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset @@ -89,7 +102,8 @@ module OpenTox end def fingerprint(compound) - data_entries[compound_ids.index(compound.id)] + i = compound_ids.index(compound.id) + i.nil? ? nil : data_entries[i] end def data_entries @@ -209,6 +223,8 @@ module OpenTox # does a lot of guesswork in order to determine feature types def parse_table table, bioassay=true + # TODO: remove empty entries + write tests + time = Time.now # features @@ -220,7 +236,7 @@ module OpenTox numeric = [] # guess feature types feature_names.each_with_index do |f,i| - metadata = {} + metadata = {:name => f} values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq if values.size == 0 # empty feature @@ -246,7 +262,7 @@ module OpenTox feature = NominalFeature.find_or_create_by(metadata) end end - feature_ids << OpenTox::Feature.find_or_create_by(metadata).id + feature_ids << feature.id end $logger.debug "Feature values: #{Time.now-time}" @@ -262,6 +278,11 @@ module OpenTox table.each_with_index do |vals,i| ct = Time.now identifier = vals.shift + #if vals.compact.empty? + #warnings << "No values for compound at position #{i+2}, all entries are ignored." + #@data_entries.pop + #next + #end begin case compound_format when /SMILES/i diff --git a/lib/feature.rb b/lib/feature.rb index 0801a47..005d78f 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -27,7 +27,8 @@ module OpenTox end class Smarts < NominalFeature - field :name, as: :smarts, type: String # causes warnings + field :smarts, type: String + #field :name, as: :smarts, type: String # causes warnings field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptors.smarts_match" field :parameters, type: Hash, default: {:count => false} def initialize params @@ -46,6 +47,10 @@ module OpenTox end end + class FingerprintSmarts < Smarts + field :count, type: Integer + end + class NominalBioAssay < NominalFeature field :description, type: String end diff --git a/lib/opentox-client.rb b/lib/opentox-client.rb index 42e8186..092b84e 100644 --- a/lib/opentox-client.rb +++ b/lib/opentox-client.rb @@ -5,12 +5,14 @@ require 'yaml' require 'json' require 'logger' require 'mongoid' +require 'rserve' # TODO store development/test, validation, production in separate databases ENV["MONGOID_ENV"] = "development" Mongoid.load!("#{ENV['HOME']}/.opentox/config/mongoid.yml") +R = Rserve::Connection.new -CLASSES = ["Compound", "Feature", "Dataset"]#, "Validation", "Task", "Investigation"] +CLASSES = ["Feature","Compound", "Dataset"]#, "Validation", "Task", "Investigation"] #CLASSES = ["Feature", "Dataset", "Validation", "Task", "Investigation"] # Regular expressions for parsing classification data @@ -26,8 +28,8 @@ FALSE_REGEXP = /^(false|inactive|0|0.0|low tox|deactivating|non-carcinogen|non-m #"otlogger.rb", "opentox.rb", #"task.rb", - "compound.rb", "feature.rb", + "compound.rb", #"data_entry.rb", "dataset.rb", #"algorithm.rb", -- cgit v1.2.3