From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 15:25:50 +0100 Subject: descriptor tests --- ext/lazar/extconf.rb | 36 +------ lib/bbrc.rb | 165 ----------------------------- lib/classification.rb | 1 - lib/compound.rb | 67 +++++------- lib/crossvalidation.rb | 1 - lib/dataset.rb | 2 - lib/descriptor.rb | 252 --------------------------------------------- lib/feature.rb | 9 -- lib/lazar.rb | 8 -- lib/model.rb | 3 - lib/overwrite.rb | 6 +- lib/physchem.rb | 4 + lib/regression.rb | 3 +- lib/rest-client-wrapper.rb | 1 - lib/similarity.rb | 58 ----------- lib/validation.rb | 10 -- test/compound.rb | 3 +- test/dataset.rb | 2 +- test/descriptor.rb | 68 +++++------- 19 files changed, 61 insertions(+), 638 deletions(-) delete mode 100644 lib/bbrc.rb delete mode 100644 lib/descriptor.rb delete mode 100644 lib/similarity.rb diff --git a/ext/lazar/extconf.rb b/ext/lazar/extconf.rb index edb960a..a76f0f4 100644 --- a/ext/lazar/extconf.rb +++ b/ext/lazar/extconf.rb @@ -5,11 +5,10 @@ main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..","..")) # install OpenBabel - openbabel_version = "2.3.2" openbabel_dir = File.join main_dir, "openbabel" -src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}" +src_dir = openbabel_dir build_dir = File.join src_dir, "build" install_dir = openbabel_dir install_lib_dir = File.join install_dir, "lib" @@ -52,37 +51,4 @@ end ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0") ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib") -# compile ruby bindings -=begin -puts "Compiling and installing OpenBabel Ruby bindings." -Dir.chdir ruby_src_dir do - # fix rpath - system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}" - system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}" - system "make -j#{nr_processors}" -end -=end - -# install fminer -fminer_dir = File.join main_dir, "libfminer" -system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}" - -["libbbrc","liblast"].each do |lib| - FileUtils.cd File.join(fminer_dir,lib) - system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile" - system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile" - system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile" - # TODO fix in fminer Makefile - system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH) - system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH) - system "make ruby" -end - -# install last-utils -FileUtils.cd main_dir -system "git clone git://github.com/amaunz/last-utils.git" -FileUtils.cd File.join(main_dir,"last-utils") -`sed -i '8s/"openbabel", //' lu.rb` - -# install R packagemain_dir $makefile_created = true diff --git a/lib/bbrc.rb b/lib/bbrc.rb deleted file mode 100644 index 4594f68..0000000 --- a/lib/bbrc.rb +++ /dev/null @@ -1,165 +0,0 @@ -module OpenTox - module Algorithm - class Fminer - TABLE_OF_ELEMENTS = [ -"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", 
"Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] - - # - # Run bbrc algorithm on dataset - # - # @param [OpenTox::Dataset] training dataset - # @param [optional] parameters BBRC parameters, accepted parameters are - # - min_frequency Minimum frequency (default 5) - # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") - # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true") - # - min_chisq_significance Significance threshold (between 0 and 1) - # - nr_hits Set to "true" to get hit count instead of presence - # - get_target Set to "true" to obtain target variable as feature - # @return [OpenTox::Dataset] Fminer Dataset - def self.bbrc training_dataset, params={} - - time = Time.now - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 - - prediction_feature = training_dataset.features.first - if params[:min_frequency] - minfreq = params[:min_frequency] - else - per_mil = 5 # value from latest version - per_mil = 8 # as suggested below - i = training_dataset.feature_ids.index prediction_feature.id - nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size - minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST - minfreq = 2 unless minfreq > 2 - minfreq = minfreq.round - end - - @bbrc ||= Bbrc::Bbrc.new - @bbrc.Reset - if prediction_feature.numeric - @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! - else - bad_request_error "No accept values for "\ - "dataset '#{training_dataset.id}' and "\ - "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values - value2act = Hash[[*prediction_feature.accept_values.map.with_index]] - end - @bbrc.SetMinfreq(minfreq) - @bbrc.SetType(1) if params[:feature_type] == "paths" - @bbrc.SetBackbone(false) if params[:backbone] == "false" - @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] - @bbrc.SetConsoleOut(false) - - params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false - feature_dataset = FminerDataset.new( - :training_dataset_id => training_dataset.id, - :training_algorithm => "#{self.to_s}.bbrc", - :training_feature_id => prediction_feature.id , - :training_parameters => { - :min_frequency => minfreq, - :nr_hits => nr_hits, - :backbone => (params[:backbone] == false ? false : true) - } - - ) - feature_dataset.compounds = training_dataset.compounds - - # add data - training_dataset.compounds.each_with_index do |compound,i| - act = value2act[training_dataset.data_entries[i].first] - if act # TODO check if this works - @bbrc.AddCompound(compound.smiles,i+1) - @bbrc.AddActivity(act,i+1) - end - end - #g_median=@fminer.all_activities.values.to_scale.median - - #task.progress 10 - #step_width = 80 / @bbrc.GetNoRootNodes().to_f - - $logger.debug "BBRC setup: #{Time.now-time}" - time = Time.now - ftime = 0 - itime = 0 - rtime = 0 - - # run @bbrc - (0 .. 
@bbrc.GetNoRootNodes()-1).each do |j| - results = @bbrc.MineRoot(j) - results.each do |result| - rt = Time.now - f = YAML.load(result)[0] - smarts = f.shift - # convert fminer SMARTS representation into a more human readable format - smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do - element = TABLE_OF_ELEMENTS[$1.to_i-1] - $2 == "a" ? element.downcase : element - end - p_value = f.shift - f.flatten! - compound_idxs = f.collect{|e| e.first.first-1} - # majority class - effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode - -=begin - if (!@bbrc.GetRegression) - id_arrs = f[2..-1].flatten - max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc - effect = max+1 - else #regression part - id_arrs = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - id=id.keys[0] # extract id from hit count hash - f_arr.push(@fminer.all_activities[id]) - end - f_median=f_arr.to_scale.median - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end -=end - rtime += Time.now - rt - - ft = Time.now - feature = OpenTox::FminerSmarts.find_or_create_by({ - "smarts" => smarts, - "p_value" => p_value.to_f.abs.round(5), - "effect" => effect, - "dataset_id" => feature_dataset.id - }) - feature_dataset.feature_ids << feature.id - ftime += Time.now - ft - - it = Time.now - f.each do |id_count_hash| - id_count_hash.each do |id,count| - nr_hits ? count = count.to_i : count = 1 - feature_dataset.data_entries[id-1] ||= [] - feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count - end - end - itime += Time.now - it - - end - end - - $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" - time = Time.now - - feature_dataset.fill_nil_with 0 - - $logger.debug "Prepare save: #{Time.now-time}" - time = Time.now - feature_dataset.save - - $logger.debug "Save: #{Time.now-time}" - feature_dataset - - end - end - end -end diff --git a/lib/classification.rb b/lib/classification.rb index 7a225bb..abbb5b3 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -92,7 +92,6 @@ module OpenTox prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting prediction = prediction.sub(/Val/,"") if prediction # Convert back confidence = 0.0 if prediction.nil? - #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) end {:value => prediction, :confidence => confidence} diff --git a/lib/compound.rb b/lib/compound.rb index 8c11831..2a79fd6 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,7 +1,3 @@ -# TODO: check -# *** Open Babel Error in ParseFile -# Could not find contribution data file. 
- CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox @@ -9,7 +5,6 @@ module OpenTox class Compound require_relative "unique_descriptors.rb" include OpenTox - include OpenTox::Descriptor DEFAULT_FINGERPRINT = "MP2D" @@ -22,7 +17,6 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId - field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer field :physchem_descriptors, type: Hash, default: {} @@ -30,7 +24,6 @@ module OpenTox field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) - #index({default_fingerprint: 1}, {unique: false}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params @@ -106,7 +99,24 @@ module OpenTox end end save - physchem_descriptors + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + end + + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,self.smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + value + end end # Create a compound from smiles string @@ -281,34 +291,16 @@ module OpenTox training_dataset = Dataset.find(params[:training_dataset_id]) prediction_feature = training_dataset.features.first training_dataset.compounds.each do |compound| - #unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] - #end + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end neighbors end - def fminer_neighbors params - bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." 
unless params[:feature_dataset_id] and params[:min_sim] - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - def physchem_neighbors params feature_dataset = Dataset.find params[:feature_dataset_id] query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] @@ -317,13 +309,7 @@ module OpenTox # TODO implement pearson and cosine similarity separatly R.assign "x", query_fingerprint R.assign "y", candidate_fingerprint - # pearson r - #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby - #p "pearson" - #p sim - #p "cosine" sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - #p sim if sim >= params[:min_sim] neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming end @@ -357,9 +343,6 @@ module OpenTox ] $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} - - - #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end @@ -378,10 +361,8 @@ module OpenTox # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? - update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) - end - self["molecular_weight"].to_f + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + physchem([mw_feature])[mw_feature.id.to_s] end private diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index ea32a2b..cd94e33 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -55,7 +55,6 @@ module OpenTox predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" - #cv.statistics cv end end diff --git a/lib/dataset.rb b/lib/dataset.rb index b9c2187..af851b5 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -132,7 +132,6 @@ module OpenTox end end - # Parsers # Create a dataset from file (csv,sdf,...) 
@@ -211,7 +210,6 @@ module OpenTox value_time = 0 # compounds and values - #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} self.data_entries = [] table.each_with_index do |vals,i| diff --git a/lib/descriptor.rb b/lib/descriptor.rb deleted file mode 100644 index 14a123b..0000000 --- a/lib/descriptor.rb +++ /dev/null @@ -1,252 +0,0 @@ -require 'digest/md5' -ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" -# TODO store descriptors in mongodb - -module OpenTox - - #module Algorithm - - # Class for descriptor calculations - module Descriptor - include OpenTox - - JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - name,description = d.split(/\s+/,2) - ["Openbabel_"+name,description] unless obexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten - - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] - # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_") - ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? 
name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) - DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys - - require_relative "unique_descriptors.rb" - - # Description of available descriptors - def self.description descriptor - lib = descriptor.split('_').first - case lib - when "Openbabel" - OBDESCRIPTORS[descriptor] - when "Cdk" - name = descriptor.split('_')[0..-2].join('_') - CDKDESCRIPTORS[name] - when "Joelib" - JOELIBDESCRIPTORS[descriptor] - when "lookup" - "Read feature values from a dataset" - end - end - - # Match an array of smarts features - def self.smarts_match compounds, smarts_features, count=false - bad_request_error "Compounds for smarts_match are empty" unless compounds - bad_request_error "Smarts features for smarts_match are empty" unless smarts_features - parse compounds - @count = count - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts_features = [smarts_features] if smarts_features.is_a?(Feature) - @smarts = smarts_features.collect{|f| f.smarts} - @physchem_descriptors = nil - @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} - @compounds.each_with_index do |compound,c| - obconversion.read_string(obmol,compound.smiles) - @smarts.each_with_index do |smart,s| - smarts_pattern.init(smart) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - @data_entries[c][s] = value - end - end - serialize - end - - # Count matches of an array with smarts features - def self.smarts_count compounds, smarts - # TODO: non-overlapping matches? 
- smarts_match compounds,smarts,true - end - - # Calculate physchem descriptors - # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - parse compounds - @data_entries = Array.new(@compounds.size){[]} - @descriptors = descriptors - @smarts = nil - @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - des = {} - @descriptors.each do |d| - lib, descriptor = d.split("_",2) - lib = lib.downcase.to_sym - des[lib] ||= [] - des[lib] << descriptor - end - des.each do |lib,descriptors| - send(lib, descriptors) - end - serialize - end - - def self.openbabel descriptors - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" - obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'smi' - last_feature_idx = @physchem_descriptors.size - @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.smiles - obdescriptors.each_with_index do |descriptor,d| - @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) - end - end - @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"} - end - - def self.java_descriptors descriptors, lib - $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" - sdf = sdf_3d - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - case lib - when "cdk" - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - when "joelib" - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - end - last_feature_idx = @physchem_descriptors.size - YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - # TODO create warnings - #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? 
- # CDK Descriptors may calculate multiple values, they are stored in separate features - @physchem_descriptors += calculation.keys if i == 0 - calculation.keys.each_with_index do |name,j| - @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) - end - end - FileUtils.rm "#{sdf}#{lib}.yaml" - end - - def self.cdk descriptors - java_descriptors descriptors, "cdk" - end - - def self.joelib descriptors - java_descriptors descriptors, "joelib" - end - - def self.lookup compounds, features, dataset - parse compounds - fingerprint = [] - compounds.each do |compound| - fingerprint << [] - features.each do |feature| - end - end - end - - def self.run_cmd cmd - cmd = "#{cmd} 2>&1" - $logger.debug "running external cmd: '#{cmd}'" - p = IO.popen(cmd) do |io| - while line = io.gets - $logger.debug "> #{line.chomp}" - end - io.close - raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 - end - end - - def self.sdf_3d - # TODO check if 3d sdfs are stored in GridFS - sdf = "" - @compounds.each do |compound| - sdf << compound.sdf - end - sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" - File.open(sdf_file,"w+"){|f| f.print sdf} - sdf_file - end - - def self.parse compounds - @input_class = compounds.class.to_s - case @input_class - when "OpenTox::Compound" - @compounds = [compounds] - when "Array" - @compounds = compounds - when "OpenTox::Dataset" - @compounds = compounds.compounds - else - bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." - end - end - - def self.serialize - #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} - case @input_class - # TODO beautify and fix for other objects - when "OpenTox::Compound" - r = {} - @data_entries.first.each_with_index do |d,i| - # TODO fix @ source - r[@physchem_descriptors[i].gsub(/\./,'_')] = d - end - r - when "Array" - @data_entries - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save - dataset - end - end - - def self.fix_value val - val = val.first if val.is_a? Array and val.size == 1 - val = nil if val == "NaN" - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? 
- end - val - end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize - end - #end -end diff --git a/lib/feature.rb b/lib/feature.rb index 21572ca..b58946b 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -10,7 +10,6 @@ module OpenTox # Feature for categorical variables class NominalFeature < Feature - # TODO check if accept_values are still needed field :accept_values, type: Array def initialize params super params @@ -35,14 +34,6 @@ module OpenTox end end - # Feature for supervised fragments from Fminer algorithm - class FminerSmarts < Smarts - field :p_value, type: Float - # TODO check if effect is used - field :effect, type: String - field :dataset_id - end - # Feature for categorical bioassay results class NominalBioAssay < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index 63257ca..0125d27 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -24,7 +24,6 @@ Mongoid.load_configuration({ } }) Mongoid.raise_not_found_error = false # return nil if no document is found -#$mongo = Mongoid.default_client $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs @@ -57,9 +56,6 @@ suppressPackageStartupMessages({ " # Require sub-Repositories -#require_relative '../libfminer/libbbrc/bbrc' # include before openbabel -#require_relative '../libfminer/liblast/last' # -#require_relative '../last-utils/lu.rb' require_relative '../openbabel/lib/openbabel' # Fminer environment variables @@ -79,14 +75,10 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "opentox.rb", "feature.rb", "physchem.rb", - "descriptor.rb", "compound.rb", "dataset.rb", - "descriptor.rb", "algorithm.rb", - #"bbrc.rb", "model.rb", - "similarity.rb", "classification.rb", "regression.rb", "validation.rb", diff --git a/lib/model.rb b/lib/model.rb index 8cffdfd..ebc0db3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -163,8 +163,6 @@ module OpenTox :type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.1 - #:type => "FP4", - #:min_sim => 0.7 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end @@ -197,7 +195,6 @@ module OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO cv -> repeated cv # TODO field Validations field :endpoint, type: String field :species, type: String diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 2287a92..cef5758 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -23,10 +23,10 @@ class Numeric end class Float - # round to significant digits + # round to n significant digits # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby - def signif(signs) - Float("%.#{signs}g" % self) + def signif(n) + Float("%.#{n}g" % self) end end diff --git a/lib/physchem.rb b/lib/physchem.rb index 64018ad..067cd59 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -37,6 +37,10 @@ module OpenTox DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTORS.each do |name,description| + lib,desc = name.split('.',2) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end require_relative "unique_descriptors.rb" diff --git a/lib/regression.rb b/lib/regression.rb index 2bf8915..e0b109e 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -23,7 +23,6 @@ module OpenTox end # TODO explicit neighbors, also for physchem - #def 
self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4" def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -129,7 +128,7 @@ module OpenTox R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" rescue return nil end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index 6b5d602..9321a75 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -29,7 +29,6 @@ module OpenTox bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri - #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? $aa || URI(uri).host == URI($aa[:uri]).host diff --git a/lib/similarity.rb b/lib/similarity.rb deleted file mode 100644 index 91e18db..0000000 --- a/lib/similarity.rb +++ /dev/null @@ -1,58 +0,0 @@ -=begin -* Name: similarity.rb -* Description: Similarity algorithms -* Author: Andreas Maunz 0 and b.size>0 - if a.size>12 && b.size>12 - a = a[0..11] - b = b[0..11] - end - a_vec = a.to_gv - b_vec = b.to_gv - val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm) - end - val - end - - end - - end -end diff --git a/lib/validation.rb b/lib/validation.rb index 9c19cde..3659341 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -102,16 +102,6 @@ module OpenTox weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) -=begin - update_attributes( - mae: mae, - rmse: rmse, - weighted_mae: weighted_mae, - weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) -=end { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } end end diff --git a/test/compound.rb b/test/compound.rb index 6c866b3..7342310 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -64,8 +64,7 @@ print c.sdf def test_chemblid c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H" - #assert_equal "CHEMBL277500", c.chemblid - assert_equal "CHEMBL581676", c.chemblid + assert_equal "CHEMBL277500", c.chemblid end def test_sdf_storage diff --git a/test/dataset.rb b/test/dataset.rb index 76eaf60..2f75703 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -69,7 +69,7 @@ class DatasetTest < MiniTest::Test assert_equal 3, d.compounds.size assert_equal 2, d.features.size assert_equal [[1,2],[4,5],[6,7]], d.data_entries - d.save_all + d.save # check if dataset has been saved correctly new_dataset = Dataset.find d.id assert_equal 3, new_dataset.compounds.size diff --git a/test/descriptor.rb b/test/descriptor.rb index 28be79e..d7d1385 100644 --- a/test/descriptor.rb +++ b/test/descriptor.rb @@ -4,81 +4,65 @@ class DescriptorTest < MiniTest::Test def test_list # check available descriptors - @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys - assert_equal 110,@descriptors.size,"wrong num physchem 
descriptors" - @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES - assert_equal 355,@descriptor_values.size,"wrong num physchem descriptors" - sum = 0 - [ @descriptors, @descriptor_values ].each do |desc| - {"Openbabel"=>15,"Cdk"=>(desc==@descriptors ? 50 : 295),"Joelib"=>45}.each do |k,v| - assert_equal v,desc.select{|x| x=~/^#{k}\./}.size,"wrong num #{k} descriptors" - sum += v - end - end - assert_equal (465),sum + assert_equal 355,PhysChem.descriptors.size,"incorrect number of physchem descriptors" + assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors" + assert_equal 295,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors" + assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors" end def test_smarts c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1" File.open("tmp.png","w+"){|f| f.puts c.png} s = Smarts.find_or_create_by(:smarts => "F=F") - result = OpenTox::Algorithm::Descriptor.smarts_match c, s + result = c.smarts_match [s] assert_equal [1], result smarts = ["CC", "C", "C=C", "CO", "F=F", "C1CCCC1", "NN"].collect{|s| Smarts.find_or_create_by(:smarts => s)} - result = OpenTox::Algorithm::Descriptor.smarts_match c, smarts + result = c.smarts_match smarts assert_equal [1, 1, 1, 0, 1, 1, 0], result smarts_count = [10, 6, 2, 0, 2, 10, 0] - result = OpenTox::Algorithm::Descriptor.smarts_count c, smarts + result = c.smarts_match smarts, true assert_equal smarts_count, result end def test_compound_openbabel_single c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Openbabel.logP"] - assert_equal 1.12518, result.first + result = c.physchem [PhysChem.find_or_create_by(:name => "Openbabel.logP")] + assert_equal 1.12518, result.first.last.round(5) end def test_compound_cdk_single c = OpenTox::Compound.from_smiles "c1ccccc1" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"] - assert_equal [12], result + result = c.physchem [PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")] + assert_equal 12, result.first.last c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"] - assert_equal [17], result - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.CarbonTypes"] + result = c.physchem [PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")] + assert_equal 17, result.first.last c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0} - assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result + physchem_features = c_types.collect{|t,nr| PhysChem.find_or_create_by(:name => t)} + result = c.physchem physchem_features + assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result.values end def test_compound_joelib_single c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Joelib.LogP"] - assert_equal [2.65908], result + result = c.physchem [PhysChem.find_or_create_by(:name => "Joelib.LogP")] + assert_equal 2.65908, result.first.last end def test_compound_all c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c - assert_equal 330, result.size - assert_equal 30.8723, result[2] - assert_equal 5, result[328] - p result + result = c.physchem 
PhysChem.descriptors + amr = PhysChem.find_or_create_by(:name => "Cdk.ALOGP.AMR", :library => "Cdk") + sbonds = PhysChem.find_by(:name => "Openbabel.sbonds") + assert_equal 30.8723, result[amr.id.to_s] + assert_equal 5, result[sbonds.id.to_s] end def test_compound_descriptor_parameters c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]#, true - assert_equal 12, result.size - assert_equal [1.12518, 17.0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result#.last - end - - def test_dataset_descriptor_parameters - dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv") - d = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ] - assert_kind_of Dataset, d - assert_equal dataset.compounds, d.compounds - assert_equal dataset.compounds.size, d.data_entries.size - assert_equal 12, d.data_entries.first.size + result = c.physchem [ "Openbabel.logP", "Cdk.AtomCount.nAtom", "Joelib.LogP" ].collect{|d| PhysChem.find_or_create_by(:name => d)} + assert_equal 3, result.size + assert_equal [1.12518, 17.0, 2.65908], result.values.collect{|v| v.round 5} end end -- cgit v1.2.3
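
Usage sketch of the descriptor API that the tests above switch to (SMARTS matching and physchem calculation as Compound instance methods, descriptors as PhysChem features). This is a sketch, not part of the patch: it assumes the lazar environment from lib/lazar.rb is loaded from the repository root and a local MongoDB is running; compound SMILES, feature names and the expected logP value are taken from test/descriptor.rb.

    require_relative 'lib/lazar'   # assumption: run from the repository root

    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"

    # SMARTS matching is now an instance method on Compound
    smarts = ["CC", "F=F"].collect{|s| OpenTox::Smarts.find_or_create_by(:smarts => s)}
    c.smarts_match smarts        # presence (1/0) per SMARTS pattern
    c.smarts_match smarts, true  # second argument switches to match counts

    # physchem descriptors are PhysChem features; the result is keyed by feature id
    logp = OpenTox::PhysChem.find_or_create_by(:name => "Openbabel.logP")
    c.physchem([logp]).values.first.round(5)  # => 1.12518 (value asserted in test/descriptor.rb)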