From b7cd3ebbb858a8891c35c45896f1bdd525f3534e Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 10 Aug 2015 13:26:06 +0200
Subject: algorithm libraries added, fminer tests pass

---
 lazar.gemspec           |  29 +++++++
 lib/algorithm.rb        |  21 +++++
 lib/bbrc.rb             | 159 ++++++++++++++++++++++++++++++++++++++
 lib/classification.rb   | 107 ++++++++++++++++++++++++++
 lib/compound.rb         |   2 -
 lib/crossvalidation.rb  | 187 +++++++++++++++++++++++++++++++++++++++++++++
 lib/dataset.rb          |   4 +-
 lib/descriptor.rb       |  73 +++++++++---------
 lib/feature.rb          |   3 +
 lib/lazar.rb            |  31 +++++++-
 lib/neighbor.rb         |  25 ++++++
 lib/regression.rb       | 199 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/similarity.rb       |  58 ++++++++++++++
 lib/validation.rb       | 114 +++++++++++++++++++++++++++
 test/descriptor-long.rb |  13 ++++
 test/descriptor.rb      |  83 ++++++++++++++++++++
 test/fminer-long.rb     |  37 +++++++++
 test/fminer.rb          |  46 +++++++++++
 18 files changed, 1146 insertions(+), 45 deletions(-)
 create mode 100644 lazar.gemspec
 create mode 100644 lib/algorithm.rb
 create mode 100644 lib/bbrc.rb
 create mode 100644 lib/classification.rb
 create mode 100644 lib/crossvalidation.rb
 create mode 100644 lib/neighbor.rb
 create mode 100644 lib/regression.rb
 create mode 100644 lib/similarity.rb
 create mode 100644 lib/validation.rb
 create mode 100644 test/descriptor-long.rb
 create mode 100644 test/descriptor.rb
 create mode 100644 test/fminer-long.rb
 create mode 100644 test/fminer.rb

diff --git a/lazar.gemspec b/lazar.gemspec
new file mode 100644
index 0000000..3a9a1af
--- /dev/null
+++ b/lazar.gemspec
@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+
+Gem::Specification.new do |s|
+  s.name        = "opentox-client" # NOTE(review): differs from the gemspec file name (lazar.gemspec) - confirm the intended gem name
+  s.version     = File.read(File.expand_path("../VERSION", __FILE__)).strip # read VERSION next to this file, independent of the working directory
+  s.authors     = ["Christoph Helma", "Martin Guetlein", "Andreas Maunz", "Micha Rautenberg", "David Vorgrimmler", "Denis Gebele"]
+  s.email       = ["helma@in-silico.ch"]
+  s.homepage    = "http://github.com/opentox/lazar"
+  s.summary     = %q{Ruby wrapper for the OpenTox REST API}
+  s.description = %q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
+  s.license     = 'GPL-3'
+
+  s.rubyforge_project = "lazar"
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  # runtime dependencies
+  s.add_runtime_dependency "bundler"
+  s.add_runtime_dependency "rest-client"
+  s.add_runtime_dependency 'nokogiri'
+  s.add_runtime_dependency "openbabel"
+  s.add_runtime_dependency 'rserve-client'
+  s.add_runtime_dependency "mongoid", '~> 5.0beta'
+
+end
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
new file mode 100644
index 0000000..113f847
--- /dev/null
+++ b/lib/algorithm.rb
@@ -0,0 +1,21 @@
+module OpenTox
+
+  module Algorithm 
+
+    # Generic method to execute algorithms. Algorithms should:
+    #   - accept a Compound, an Array of Compounds or a Dataset as first argument
+    #   - accept optional parameters as second argument
+    #   - return an object corresponding to the input type as result (e.g. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
+    # @param [String] algorithm Class and method name separated by a dot, has to start with "OpenTox::Algorithm"
+    # @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
+    # @param [Hash] parameters Algorithm parameters (optional, omitted from the call if nil)
+    # @return Algorithm result
+    def self.run algorithm, object, parameters=nil
+      bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /\AOpenTox::Algorithm/
+      klass,method = algorithm.split('.')
+      parameters.nil? ?  Object.const_get(klass).public_send(method,object) : Object.const_get(klass).public_send(method,object, parameters) # public_send: the method name comes from the caller, never dispatch to private methods such as Kernel#system
+    end
+
+  end
+end
+
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
new file mode 100644
index 0000000..6a2eed7
--- /dev/null
+++ b/lib/bbrc.rb
@@ -0,0 +1,159 @@
+module OpenTox
+  module Algorithm
+    class Fminer
+      TABLE_OF_ELEMENTS = [
+"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
+        
+      #
+      # Run bbrc algorithm on dataset
+      #
+      # @param [OpenTox::Dataset] training_dataset Training dataset, has to contain exactly one (prediction) feature
+      # @param [optional] parameters BBRC parameters, accepted parameters are
+      #   - min_frequency  Minimum frequency (default: 5 per mil of the labeled compounds, at least 2)
+      #   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
+      #   - backbone BBRC classes, pass false or 'false' to switch off mining for BBRC representatives. (default "true")
+      #   - min_chisq_significance Significance threshold (between 0 and 1)
+      #   - nr_hits Set to "true" to get hit count instead of presence
+      #   - get_target Set to "true" to obtain target variable as feature (NOTE(review): not evaluated in this method)
+      # @return [OpenTox::Dataset] Fminer Dataset
+      def self.bbrc training_dataset, params={}
+
+        time = Time.now
+        bad_request_error "Expected exactly one prediction feature in training_dataset #{training_dataset.id}, found #{training_dataset.features.size}" unless training_dataset.features.size == 1
+
+        prediction_feature = training_dataset.features.first
+        if params[:min_frequency]
+          minfreq = params[:min_frequency]
+        else
+          per_mil = 5 # value from latest version
+          i = training_dataset.feature_ids.index prediction_feature.id
+          nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
+          minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+          minfreq = 2 unless minfreq > 2
+          minfreq = minfreq.round
+        end
+
+        @bbrc ||= Bbrc::Bbrc.new
+        @bbrc.Reset
+        if prediction_feature.numeric
+          @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
+        else
+          bad_request_error "No accept values for "\
+                            "dataset '#{training_dataset.id}' and "\
+                            "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
+          value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
+        end
+        @bbrc.SetMinfreq(minfreq)
+        @bbrc.SetType(1) if params[:feature_type] == "paths"
+        @bbrc.SetBackbone(false) if params[:backbone].to_s == "false" # accepts the boolean false as well as the string "false"
+        @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
+        @bbrc.SetConsoleOut(false)
+
+        nr_hits = params[:nr_hits] || false
+        feature_dataset = FminerDataset.new(
+            :training_dataset_id => training_dataset.id,
+            :training_algorithm => "#{self.to_s}.bbrc",
+            :training_feature_id => prediction_feature.id ,
+            :training_parameters => {
+              :min_frequency => minfreq,
+              :nr_hits => nr_hits,
+              :backbone => (params[:backbone].to_s != "false") # same condition as the SetBackbone switch above
+            }
+
+        )
+        feature_dataset.compounds = training_dataset.compounds
+
+        # add data; value2act only exists for classification, regression activities are passed through unchanged (NOTE(review): confirm the scale libbbrc expects)
+        training_dataset.compounds.each_with_index do |compound,i|
+          @bbrc.AddCompound(compound.smiles,i+1)
+          act = training_dataset.data_entries[i].first
+          @bbrc.AddActivity((value2act ? value2act[act] : act),i+1)
+        end
+        #g_median=@fminer.all_activities.values.to_scale.median
+
+        #task.progress 10
+        #step_width = 80 / @bbrc.GetNoRootNodes().to_f
+
+        $logger.debug "BBRC setup: #{Time.now-time}"
+        time = Time.now
+        ftime = 0
+        itime = 0
+        rtime = 0
+  
+        # run @bbrc
+        (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
+          results = @bbrc.MineRoot(j)
+          results.each do |result|
+            rt = Time.now
+            f = YAML.load(result)[0]
+            smarts = f.shift
+            # convert fminer SMARTS representation into a more human readable format
+            smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
+             element = TABLE_OF_ELEMENTS[$1.to_i-1]
+             $2 == "a" ? element.downcase : element
+            end
+            p_value = f.shift
+            f.flatten!
+  
+=begin
+            if (!@bbrc.GetRegression)
+              id_arrs = f[2..-1].flatten
+              max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
+              effect = max+1
+            else #regression part
+              id_arrs = f[2]
+              # DV: effect calculation
+              f_arr=Array.new
+              f[2].each do |id|
+                id=id.keys[0] # extract id from hit count hash
+                f_arr.push(@fminer.all_activities[id])
+              end
+              f_median=f_arr.to_scale.median
+              if g_median >= f_median
+                effect = 'activating'
+              else
+                effect = 'deactivating'
+              end
+            end
+=end
+            rtime += Time.now - rt
+  
+            ft = Time.now
+            feature = OpenTox::FminerSmarts.find_or_create_by({
+              "smarts" => smarts,
+              "p_value" => p_value.to_f.abs.round(5),
+              #"effect" => effect,
+              "dataset_id" => feature_dataset.id
+            })
+            feature_dataset.feature_ids << feature.id
+            ftime += Time.now - ft
+
+            it = Time.now
+            f.each do |id_count_hash|
+              id_count_hash.each do |id,count|
+                count = nr_hits ? count.to_i : 1
+                feature_dataset.data_entries[id-1] ||= []
+                feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
+              end
+            end
+            itime += Time.now - it
+  
+          end
+        end
+
+        $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
+        time = Time.now
+
+        feature_dataset.fill_nil_with 0
+
+        $logger.debug "Prepare save: #{Time.now-time}"
+        time = Time.now
+        feature_dataset.save_all
+
+        $logger.debug "Save: #{Time.now-time}"
+        feature_dataset
+  
+      end
+    end
+  end
+end
diff --git a/lib/classification.rb b/lib/classification.rb
new file mode 100644
index 0000000..fc6fa77
--- /dev/null
+++ b/lib/classification.rb
@@ -0,0 +1,107 @@
+module OpenTox
+  module Algorithm
+    
+    class Classification
+
+      def self.weighted_majority_vote neighbors
+        # Similarity-weighted vote; every neighbor row has its similarity at index 1 and an Array of activities at index 2.
+        # Returns [prediction, confidence], [nil,nil] without neighbors; fails with a bad request for more than two classes.
+        return [nil,nil] if neighbors.empty?
+        weighted_sum = {}
+        neighbors.each do |_,sim,acts|
+          acts.each do |act|
+            weighted_sum[act] ||= 0
+            weighted_sum[act] += sim
+          end
+        end
+        case weighted_sum.size
+        when 1
+          return [weighted_sum.keys.first, 1.0]
+        when 2
+          sim_sum = weighted_sum[weighted_sum.keys[0]]
+          sim_sum -= weighted_sum[weighted_sum.keys[1]]
+          prediction = sim_sum > 0 ? weighted_sum.keys[0] : weighted_sum.keys[1]
+          confidence = (sim_sum/neighbors.size).abs
+          return [prediction,confidence]
+        else
+          bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted_sum.keys}'"
+        end
+      end
+
+      # Classification with majority vote from neighbors weighted by similarity
+      # @param [Array] neighbors Rows with the similarity at index 1 and the activity at index 2; training_dataset is currently not used
+      # @return [Hash] `{:value => prediction, :confidence => confidence}`, both nil without neighbors
+      def self.fminer_weighted_majority_vote neighbors, training_dataset
+
+        neighbor_contribution = 0.0
+        confidence_sum = 0.0
+        return {:value => nil, :confidence => nil} if neighbors.empty? # otherwise 0.0/0.0 below makes round raise FloatDomainError
+        $logger.debug "Weighted Majority Vote Classification."
+
+        values = neighbors.collect{|n| n[2]}.uniq
+        neighbors.each do |neighbor|
+          # (dropped `training_dataset.compound_ids.index n.id`: `n` is undefined in this scope and the resulting index was never used)
+          neighbor_weight = neighbor[1]
+          activity = values.index(neighbor[2]) + 1 # map values to integers >= 1
+          neighbor_contribution += activity * neighbor_weight
+          if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+            case activity
+            when 1
+              confidence_sum -= neighbor_weight
+            when 2
+              confidence_sum += neighbor_weight
+            end
+          else
+            confidence_sum += neighbor_weight
+          end
+        end
+        if values.size == 2
+          if confidence_sum >= 0.0
+            prediction = values[1]
+          elsif confidence_sum < 0.0
+            prediction = values[0]
+          end
+        elsif values.size == 1 # all neighbors have the same value
+          prediction = values[0]
+        else
+          prediction = (neighbor_contribution/confidence_sum).round  # AM: new multinomial prediction; NOTE(review): this is the 1-based position in `values`, not the value itself
+        end
+
+        confidence = (confidence_sum/neighbors.size).abs
+        {:value => prediction, :confidence => confidence}
+      end
+
+      # Local support vector classification from neighbors
+      # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
+      # @return [Hash] `{:prediction => ..., :confidence => ...}`; prediction stays nil and confidence 0.0 if `params[:activities]` is empty
+      def self.local_svm_classification(params)
+
+        confidence = 0.0
+        prediction = nil
+
+        $logger.debug "Local SVM."
+        if params[:activities].size>0
+          if params[:props]
+            n_prop = params[:props][0].collect.to_a
+            q_prop = params[:props][1].collect.to_a
+            props = [ n_prop, q_prop ]
+          end
+          activities = params[:activities].collect.to_a
+          activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
+          prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+          prediction = prediction.sub(/Val/,"") if prediction # Convert back
+          confidence = 0.0 if prediction.nil? # NOTE(review): has no effect, confidence is reassigned unconditionally two lines below
+          #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
+          confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
+        end
+        {:prediction => prediction, :confidence => confidence}
+
+      end
+
+
+
+    end
+
+  end
+end
+
diff --git a/lib/compound.rb b/lib/compound.rb
index 3ba1670..3418fcc 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,8 +3,6 @@
 #    Could not find contribution data file.
 
 CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
-require 'openbabel'
-require "base64"
 
 module OpenTox
 
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
new file mode 100644
index 0000000..d926cc4
--- /dev/null
+++ b/lib/crossvalidation.rb
@@ -0,0 +1,187 @@
+module OpenTox
+
+  class CrossValidation # fields shared by crossvalidation results
+    field :validation_ids, type: Array, default: [] # ids of the per-fold validation objects
+    field :folds, type: Integer # number of folds
+    field :nr_instances, type: Integer # instances summed over all folds
+    field :nr_unpredicted, type: Integer # unpredicted instances summed over all folds
+    field :predictions, type: Array # prediction rows collected from all folds
+    field :finished_at, type: Time # NOTE(review): presumably the completion time - confirm against the create methods
+  end
+
+  class ClassificationCrossValidation < CrossValidation
+
+    field :accept_values, type: Array
+    field :confusion_matrix, type: Array
+    field :weighted_confusion_matrix, type: Array
+    field :accuracy, type: Float
+    field :weighted_accuracy, type: Float
+    field :true_rate, type: Hash
+    field :predictivity, type: Hash
+    # TODO auc, f-measure (usability??)
+
+    def self.create model, n=10
+      # Run an n-fold crossvalidation of model on its training dataset, pool the fold results and save them.
+      # @param model Object providing prediction_feature_id and training_dataset_id
+      # @param [Integer] n Number of folds (default 10)
+      # @return [ClassificationCrossValidation] The saved crossvalidation
+      cv = self.new
+      validation_ids = []
+      nr_instances = 0
+      nr_unpredicted = 0
+      predictions = []
+      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+      accept_values = Feature.find(model.prediction_feature_id).accept_values
+      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} # block form only, an additional default argument would be superseded by the block
+      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      fold_nr = 1
+      training_dataset = Dataset.find model.training_dataset_id
+      training_dataset.folds(n).each do |fold|
+        t = Time.now
+        $logger.debug "Fold #{fold_nr}"
+        validation = validation_class.create(model, fold[0], fold[1])
+        validation_ids << validation.id
+        nr_instances += validation.nr_instances
+        nr_unpredicted += validation.nr_unpredicted
+        predictions += validation.predictions
+        validation.confusion_matrix.each_with_index do |r,i|
+          r.each_with_index do |c,j|
+            confusion_matrix[i][j] += c
+            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+          end
+        end
+        $logger.debug "Fold #{fold_nr}:  #{Time.now-t} seconds"
+        fold_nr +=1
+      end
+      true_rate = {}
+      predictivity = {}
+      accept_values.each_with_index do |v,i|
+        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|row| row[i]}.reduce(:+).to_f
+      end
+      correct = (0...accept_values.size).reduce(0){|sum,i| sum + confusion_matrix[i][i]} # diagonal sum, works for any number of classes
+      weighted_correct = (0...accept_values.size).reduce(0){|sum,i| sum + weighted_confusion_matrix[i][i]}
+      confidence_sum = weighted_confusion_matrix.flatten.reduce(0,:+)
+      cv.update_attributes(
+        folds: n, validation_ids: validation_ids, # previously collected but never stored
+        nr_instances: nr_instances,
+        nr_unpredicted: nr_unpredicted,
+        accept_values: accept_values,
+        confusion_matrix: confusion_matrix,
+        weighted_confusion_matrix: weighted_confusion_matrix,
+        accuracy: correct/(nr_instances-nr_unpredicted).to_f,
+        weighted_accuracy: weighted_correct/confidence_sum.to_f,
+        true_rate: true_rate,
+        predictivity: predictivity,
+        predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
+        finished_at: Time.now
+      )
+      cv.save
+      cv
+    end
+
+    #Average area under roc  0.646
+    #Area under roc  0.646
+    #F measure carcinogen: 0.769, noncarcinogen: 0.348
+  end
+
+  class RegressionCrossValidation < Validation
+
+    field :validation_ids, type: Array, default: []
+    field :folds, type: Integer
+    field :rmse, type: Float
+    field :mae, type: Float
+    field :weighted_rmse, type: Float
+    field :weighted_mae, type: Float
+
+    def self.create model, n=10
+      # Run an n-fold crossvalidation of model on its training dataset and save the pooled error statistics.
+      # @param model Object providing training_dataset_id
+      # @param [Integer] n Number of folds (default 10)
+      # @return [RegressionCrossValidation] The saved crossvalidation
+      cv = self.new
+      validation_ids = []
+      nr_instances = 0
+      nr_unpredicted = 0
+      predictions = []
+      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+      fold_nr = 1
+      training_dataset = Dataset.find model.training_dataset_id
+      training_dataset.folds(n).each do |fold|
+        t = Time.now
+        $logger.debug "Predicting fold #{fold_nr}"
+
+        validation = validation_class.create(model, fold[0], fold[1])
+        validation_ids << validation.id
+        nr_instances += validation.nr_instances
+        nr_unpredicted += validation.nr_unpredicted
+        predictions += validation.predictions
+        $logger.debug "Fold #{fold_nr}:  #{Time.now-t} seconds"
+        fold_nr +=1
+      end
+      rmse = 0
+      weighted_rmse = 0
+      mae = 0
+      weighted_mae = 0
+      nr_predicted = 0 # separate counter, n has to keep the number of folds for the stored result
+      confidence_sum = 0
+      predictions.each do |pred|
+        _,activity,prediction,confidence = pred
+        if activity and prediction
+          error = prediction-activity
+          rmse += error**2
+          weighted_rmse += confidence*error**2
+          mae += error.abs
+          weighted_mae += confidence*error.abs
+          nr_predicted += 1
+          confidence_sum += confidence
+        else
+          # TODO: create warnings
+          $logger.warn "Ignoring prediction without activity or prediction value: #{pred.inspect}"
+        end
+      end
+      mae = mae/nr_predicted
+      weighted_mae = weighted_mae/confidence_sum
+      rmse = Math.sqrt(rmse/nr_predicted)
+      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+      cv.update_attributes(
+        folds: n,
+        validation_ids: validation_ids,
+        nr_instances: nr_instances,
+        nr_unpredicted: nr_unpredicted,
+        predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+        mae: mae,
+        rmse: rmse,
+        weighted_mae: weighted_mae,
+        weighted_rmse: weighted_rmse
+      )
+      cv.save
+      cv
+    end
+
+    def plot
+      # Plot log(Prediction) against log(Measurement) with R into an SVG file and return the path of that file
+      x = predictions.collect{|p| p[1]} # second element of each prediction row, assigned to "Measurement"
+      y = predictions.collect{|p| p[2]} # third element of each prediction row, assigned to "Prediction"
+      R.assign "Measurement", x
+      R.assign "Prediction", y
+      R.eval "par(pty='s')" # sets the plot type to be square
+      #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
+      #R.eval "error <- log(Measurement)-log(Prediction)"
+      R.eval "error <- Measurement-Prediction"
+      R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
+      R.eval "mae <- mean( abs(error), na.rm = TRUE)"
+      R.eval "r <- cor(log(Prediction),log(Measurement))" # correlation uses log values, rmse/mae above use untransformed values
+      R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
+      R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
+      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
+      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
+      R.eval "abline(0,1,col='blue')" # identity line (intercept 0, slope 1)
+      #R.eval "abline(fitline,col='red')"
+      R.eval "dev.off()"
+      "/tmp/#{id.to_s}.svg"
+    end
+  end
+
+
+end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 0237adf..4f6f0b5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -294,8 +294,8 @@ module OpenTox
     end
 =end
 
-    private
-
+    # Fill unset data entries 
+    # @param any value
     def fill_nil_with n
       (0 .. compound_ids.size-1).each do |i|
         @data_entries[i] ||= []
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 68bc7a2..335f3dc 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -1,11 +1,12 @@
 require 'digest/md5'
 ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" 
-BABEL_3D_CACHE_DIR = File.join(File.dirname(__FILE__),"..",'/babel_3d_cache')
 # TODO store descriptors in mongodb
 
 module OpenTox
 
   module Algorithm 
+    
+    # Class for descriptor calculations
     class Descriptor 
       include OpenTox
 
@@ -39,6 +40,7 @@ module OpenTox
 
       require_relative "unique_descriptors.rb"
 
+      # Description of available descriptors
       def self.description descriptor
         lib = descriptor.split('.').first
         case lib
@@ -54,6 +56,7 @@ module OpenTox
         end
       end
 
+      # Match an array of smarts features 
       def self.smarts_match compounds, smarts_features, count=false
         bad_request_error "Compounds for smarts_match are empty" unless compounds
         bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
@@ -73,7 +76,7 @@ module OpenTox
           # eg. at line 249 of rat_feature_dataset
           # which worked with opentox-client
           # (but no smarts_match)
-          p "'#{compound.inchi}'"
+          #p "'#{compound.inchi}'"
           obconversion.read_string(obmol,compound.inchi)
           @smarts.each_with_index do |smart,s|
             smarts_pattern.init(smart)
@@ -88,49 +91,20 @@ module OpenTox
         serialize 
       end
 
+      # Count matches of an array with smarts features 
       def self.smarts_count compounds, smarts
+        # TODO: non-overlapping matches?
         smarts_match compounds,smarts,true
       end
 
-      def self.serialize
-        case @input_class
-        when "OpenTox::Compound"
-          if @with_names and @physchem_descriptors
-            [@physchem_descriptors,@data_entries.first]
-          else
-            @data_entries.first
-          end
-        when "Array"
-          if @with_names and @physchem_descriptors
-            [@physchem_descriptors,@data_entries.first]
-          else
-            @data_entries
-          end
-        when "OpenTox::Dataset"
-          dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
-          if @smarts
-            dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
-            @count ? algo = "count" : algo = "match"
-            dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
-            
-          elsif @physchem_descriptors
-            dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
-            dataset.data_entries = @data_entries
-            dataset.feature_calculation_algorithm = "#{self}.physchem"
-            #TODO params?
-          end
-          dataset.save_all
-          dataset
-        end
-      end
-
-      def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false
+      # Calculate physchem descriptors
+      # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
+      def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
         parse compounds
         @data_entries = Array.new(@compounds.size){[]}
         @descriptors = descriptors
         @smarts = nil
         @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
-        @with_names = with_names
         des = {}
         @descriptors.each do |d|
           lib, descriptor = d.split(".",2)
@@ -173,7 +147,8 @@ module OpenTox
         end
         last_feature_idx = @physchem_descriptors.size
         YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
-          $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty?
+          # TODO create warnings
+          #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
           # CDK Descriptors may calculate multiple values, they are stored in separate features
           @physchem_descriptors += calculation.keys if i == 0
           calculation.keys.each_with_index do |name,j|
@@ -238,6 +213,30 @@ module OpenTox
         end
       end
 
+      def self.serialize # build the return value from @data_entries, its type depends on @input_class
+        case @input_class
+        when "OpenTox::Compound"
+          @data_entries.first
+        when "Array"
+          @data_entries
+        when "OpenTox::Dataset"
+          dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
+          if @smarts
+            dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
+            @count ? algo = "count" : algo = "match"
+            dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
+            # NOTE(review): data_entries are only assigned in the physchem branch below - confirm that SMARTS datasets are meant to be saved without them
+          elsif @physchem_descriptors
+            dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
+            dataset.data_entries = @data_entries
+            dataset.feature_calculation_algorithm = "#{self}.physchem"
+            #TODO params?
+          end
+          dataset.save_all
+          dataset
+        end
+      end
+
       def self.fix_value val
         val = val.first if val.is_a? Array and val.size == 1
         val = nil if val == "NaN"
diff --git a/lib/feature.rb b/lib/feature.rb
index 9deb199..b2bc1f5 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -29,6 +29,9 @@ module OpenTox
   # Feature for SMARTS fragments
   class Smarts < NominalFeature
     field :smarts, type: String 
+    def self.from_smarts smarts # find the Smarts feature with this pattern or create it
+      self.find_or_create_by :smarts => smarts
+    end
   end
 
   # Feature for supervised fragments from Fminer algorithm
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 8831ba2..2e7e7c2 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -6,6 +6,9 @@ require 'json'
 require 'logger'
 require 'mongoid'
 require 'rserve'
+require "nokogiri"
+require "base64"
+
 
 # Mongo setup
 # TODO retrieve correct environment from Rack/Sinatra
@@ -27,8 +30,21 @@ Mongo::Logger.logger = $logger
 Mongo::Logger.level = Logger::WARN 
 #Mongoid.logger = $logger
 
+# Require sub-Repositories
+require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
+require_relative '../libfminer/liblast/last' # 
+require_relative '../last-utils/lu.rb'
+require 'openbabel'
+
+# Fminer environment variables
+ENV['FMINER_SMARTS'] = 'true'
+ENV['FMINER_NO_AROMATIC'] = 'true'
+ENV['FMINER_PVALUES'] = 'true'
+ENV['FMINER_SILENT'] = 'true'
+ENV['FMINER_NR_HITS'] = 'true'
+
 # OpenTox classes and includes
-CLASSES = ["Feature","Compound",  "Dataset", "Validation", "CrossValidation"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
 
 [ # be aware of the require sequence as it affects class/method overwrites
   "overwrite.rb",
@@ -39,8 +55,15 @@ CLASSES = ["Feature","Compound",  "Dataset", "Validation", "CrossValidation"]# A
   "compound.rb",
   "dataset.rb",
   "descriptor.rb",
-  #"algorithm.rb",
-  #"model.rb",
-  #"validation.rb"
+  "algorithm.rb",
+  "descriptor.rb",
+  "bbrc.rb",
+  "lazar.rb",
+  "similarity.rb",
+  "neighbor.rb",
+  "classification.rb",
+  "regression.rb",
+  "validation.rb",
+  "crossvalidation.rb",
 ].each{ |f| require_relative f }
 
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
new file mode 100644
index 0000000..a2c28d4
--- /dev/null
+++ b/lib/neighbor.rb
@@ -0,0 +1,25 @@
+module OpenTox
+  module Algorithm
+    class Neighbor
+      # Delegates to compound.neighbors with params[:min_sim]
+      def self.fingerprint_similarity compound, params={}
+        compound.neighbors params[:min_sim]
+      end
+      # Returns [compound_id, similarity] pairs for all rows of the feature dataset (params[:feature_dataset_id]) with a similarity above params[:min_sim]
+      def self.fminer_similarity compound, params
+        feature_dataset = Dataset.find params[:feature_dataset_id]
+        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} )
+        neighbors = []
+
+        # find neighbors: compare the query fingerprint with every data entry of the feature dataset
+        feature_dataset.data_entries.each_with_index do |fingerprint, i|
+          sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
+          if sim > params[:min_sim]
+            neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+          end
+        end
+        neighbors
+      end
+    end
+  end
+end
diff --git a/lib/regression.rb b/lib/regression.rb
new file mode 100644
index 0000000..891d7f9
--- /dev/null
+++ b/lib/regression.rb
@@ -0,0 +1,199 @@
+# TODO install R packages kernlab, caret, doMC, class, e1071
+
+
+        # log transform activities (create new dataset)
+        # scale, normalize features, might not be necessary
+        # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
+        # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
+        # zero-order correlation and the semi-partial correlation
+        # seems to be necessary for svm
+        #   http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
+        #   http://stackoverflow.com/questions/15436367/svm-scaling-input-values
+        # use lasso or elastic net??
+        # select relevant features
+        #   remove features with a single value
+        #   remove correlated features
+        #   remove features not correlated with endpoint
+module OpenTox
+  module Algorithm
+    
+    class Regression
+
+      def self.weighted_average neighbors
+        weighted_sum = 0.0
+        sim_sum = 0.0
+        neighbors.each do |row|
+          n,sim,acts = row
+          acts.each do |act|
+            weighted_sum += sim*Math.log10(act)
+            sim_sum += sim
+          end
+        end
+        confidence = sim_sum/neighbors.size.to_f
+        sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+        [prediction,confidence]
+      end
+
+      # Local support vector regression from neighbors 
+      # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
+
+        confidence = 0.0
+        prediction = nil
+
+        $logger.debug "Local SVM."
+        props = neighbors.collect{|row| row[3] }
+        neighbors.shift
+        activities = neighbors.collect{|n| n[2]}
+        prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+        prediction = nil if (!prediction.nil? && prediction.infinite?)
+        $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
+        if prediction
+          confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
+        else
+          confidence = nil if prediction.nil?
+        end
+          [prediction, confidence]
+
+      end
+
+
+      # Local support vector prediction from neighbors (propositionalized setting).
+      # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] props propositionalization of query and neighbors: props[0] is the query Array, props[1..-1] the neighbor matrix
+      # @param [Array] activities activities for neighbors; if all of them are equal, that value is returned without model building
+      # @param [Float] min_train_performance predictions are censored (nil) if the model performance is NaN or below this value
+      # @return [Numeric, nil] A prediction value, nil if no prediction could be made.
+      # NOTE(review): the model building branch looks unfinished (debug output, unassigned train_success) - see the inline notes.
+      def self.local_svm_prop(props, activities, min_train_performance)
+
+        $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+        n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
+        q_prop = props[0] # is an Array.
+
+        prediction = nil
+        if activities.uniq.size == 1
+          prediction = activities[0]
+        else
+          t = Time.now
+          #$logger.debug gram_matrix.to_yaml
+          #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
+          @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
+          rs = [] # R statements, written to a script file further below
+          ["caret", "doMC", "class"].each do |lib|
+            #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
+            rs << "suppressPackageStartupMessages(library('#{lib}'))"
+          end
+          #@r.eval "registerDoMC()" # switch on parallel processing
+          rs << "registerDoMC()" # switch on parallel processing
+          #@r.eval "set.seed(1)"
+          rs << "set.seed(1)"
+          $logger.debug "Loading R packages: #{Time.now-t}"
+          t = Time.now
+          p n_prop # NOTE(review): debug output
+          begin
+
+            # set data
+            rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+            rs << "n_prop <- c(#{n_prop.flatten.join(',')})" # NOTE(review): identical to the previous statement
+            rs << "n_prop_x_size <- c(#{n_prop.size})"
+            rs << "n_prop_y_size <- c(#{n_prop[0].size})"
+            rs << "y <- c(#{activities.join(',')})"
+            rs << "q_prop <- c(#{q_prop.join(',')})"
+            rs << "y = matrix(y)"
+            rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+            rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
+
+            $logger.debug "Setting R data: #{Time.now-t}"
+            t = Time.now
+            # prepare data
+            rs << "
+              weights=NULL
+              if (!(class(y) == 'numeric')) { 
+                y = factor(y)
+                weights=unlist(as.list(prop.table(table(y))))
+                weights=(weights-1)^2
+              }
+            "
+
+            rs << "
+              rem = nearZeroVar(prop_matrix)
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+              rem = findCorrelation(cor(prop_matrix))
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+            "
+
+            #p @r.eval("y").to_ruby
+            #p "weights"
+            #p @r.eval("weights").to_ruby
+            $logger.debug "Preparing R data: #{Time.now-t}"
+            t = Time.now
+            # model + support vectors
+            #train_success = @r.eval <<-EOR
+            rs << '
+              model = train(prop_matrix,y,
+                             method="svmRadial",
+                             preProcess=c("center", "scale"),
+                             class.weights=weights,
+                             trControl=trainControl(method="LGOCV",number=10),
+                             tuneLength=8
+                           )
+              perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+            '
+            File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} # the collected statements are written to a fixed script path
+            p rs.join("\n") # NOTE(review): debug output
+            p `Rscript /tmp/r.r` # runs the script with Rscript, its output is only printed
+=begin
+            @r.void_eval <<-EOR
+              model = train(prop_matrix,y,
+                             method="svmRadial",
+                             #preProcess=c("center", "scale"),
+                             #class.weights=weights,
+                             #trControl=trainControl(method="LGOCV",number=10),
+                             #tuneLength=8
+                           )
+              perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+            EOR
+=end
+            # NOTE(review): the statements above are run via Rscript, not through @r - confirm that model, q_prop and perf exist in the @r session
+            $logger.debug "Creating R SVM model: #{Time.now-t}"
+            t = Time.now
+            if train_success # NOTE(review): train_success is not assigned in this method (its assignment above is commented out)
+              # prediction
+              @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
+              #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
+              @r.eval "if (class(y)!='numeric') p = as.character(p)"
+              prediction = @r.p
+
+              # censoring
+              prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
+              prediction = nil if prediction =~ /NA/
+              $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
+            else
+              $logger.debug "Model creation failed."
+              prediction = nil 
+            end
+            $logger.debug "R Prediction: #{Time.now-t}"
+          rescue Exception => e # exceptions are logged at debug level and not re-raised
+            $logger.debug "#{e.class}: #{e.message}"
+            $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          ensure
+            #puts @r.inspect
+            #TODO: broken pipe
+            #@r.quit # free R
+          end
+        end
+        prediction
+      end
+    end
+
+  end
+end
+
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..91e18db
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,58 @@
+=begin
+* Name: similarity.rb
+* Description: Similarity algorithms
+* Author: Andreas Maunz <andreas@maunz.de>
+* Date: 10/2012
+=end
+
+module OpenTox
+  module Algorithm
+
+    class Similarity
+
+      #TODO weighted tanimoto
+
+      # Tanimoto similarity
+      # @param [Array] a fingerprints of first compound
+      # @param [Array] b fingerprints of second compound
+      # @return [Float] Tanimoto similarity
+      def self.tanimoto(a,b)
+        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
+        #common = 0.0
+        #a.each_with_index do |n,i|
+          #common += 1 if n == b[i]
+        #end
+        #common/a.size
+        # TODO check if calculation speed can be improved
+        common_p_sum = 0.0
+        all_p_sum = 0.0
+        (0...a.size).each { |idx|
+          common_p_sum += [ a[idx], b[idx] ].min
+          all_p_sum += [ a[idx], b[idx] ].max
+        }
+        common_p_sum/all_p_sum
+      end
+
+
+      # Cosine similarity
+      # @param [Array] a fingerprints of first compound
+      # @param [Array] b fingerprints of second compound
+      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
+      def self.cosine(a, b)
+        val = 0.0
+        if a.size>0 and b.size>0
+          if a.size>12 && b.size>12
+            a = a[0..11]
+            b = b[0..11]
+          end
+          a_vec = a.to_gv
+          b_vec = b.to_gv
+          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
+        end
+        val
+      end
+
+    end
+
+  end
+end
diff --git a/lib/validation.rb b/lib/validation.rb
new file mode 100644
index 0000000..bcbe49a
--- /dev/null
+++ b/lib/validation.rb
@@ -0,0 +1,114 @@
+module OpenTox
+
+  class Validation
+    # Result record of a validation run; the create methods of the subclasses fill these fields.
+    field :prediction_dataset_id, type: BSON::ObjectId # id of the dataset returned by the validation model's predict call
+    field :test_dataset_id, type: BSON::ObjectId # id of the test dataset
+    field :nr_instances, type: Integer # number of compound ids in the test dataset
+    field :nr_unpredicted, type: Integer # number of test entries counted as unpredicted
+    field :predictions, type: Array # rows of [compound_id, activity, prediction, confidence], sorted by descending confidence
+    # Looks up the dataset referenced by prediction_dataset_id.
+    def prediction_dataset
+      Dataset.find prediction_dataset_id
+    end
+    # Looks up the dataset referenced by test_dataset_id.
+    def test_dataset
+      Dataset.find test_dataset_id
+    end
+
+  end
+
+  class ClassificationValidation < Validation
+    field :accept_values, type: String
+    field :confusion_matrix, type: Array
+    field :weighted_confusion_matrix, type: Array
+
+    def self.create model, training_set, test_set
+      validation = self.class.new
+      #feature_dataset = Dataset.find model.feature_dataset_id
+      # TODO check and delegate to Algorithm
+      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
+      validation_model = model.class.create training_set#, features
+      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      prediction_dataset = validation_model.predict test_set_without_activities
+      accept_values = prediction_dataset.prediction_feature.accept_values
+      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+      predictions = []
+      nr_unpredicted = 0
+      prediction_dataset.data_entries.each_with_index do |pe,i|
+        if pe[0] and pe[1] and pe[1].numeric? 
+          prediction = pe[0]
+          # TODO prediction_feature, convention??
+          # TODO generalize for multiple classes
+          activity = test_set.data_entries[i].first
+          confidence = prediction_dataset.data_entries[i][1]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
+          if prediction == activity
+            if prediction == accept_values[0]
+              confusion_matrix[0][0] += 1
+              weighted_confusion_matrix[0][0] += confidence
+            elsif prediction == accept_values[1]
+              confusion_matrix[1][1] += 1
+              weighted_confusion_matrix[1][1] += confidence
+            end
+          elsif prediction != activity
+            if prediction == accept_values[0]
+              confusion_matrix[0][1] += 1
+              weighted_confusion_matrix[0][1] += confidence
+            elsif prediction == accept_values[1]
+              confusion_matrix[1][0] += 1
+              weighted_confusion_matrix[1][0] += confidence
+            end
+          end
+        else
+          nr_unpredicted += 1 if pe[0].nil?
+        end
+      end
+      validation = self.new(
+        :prediction_dataset_id => prediction_dataset.id,
+        :test_dataset_id => test_set.id,
+        :nr_instances => test_set.compound_ids.size,
+        :nr_unpredicted => nr_unpredicted,
+        :accept_values => accept_values,
+        :confusion_matrix => confusion_matrix,
+        :weighted_confusion_matrix => weighted_confusion_matrix,
+        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      validation.save
+      validation
+    end
+  end
+
+  class RegressionValidation < Validation
+    def self.create model, training_set, test_set
+      
+      validation_model = Model::LazarRegression.create training_set
+      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      prediction_dataset = validation_model.predict test_set_without_activities
+      predictions = []
+      nr_unpredicted = 0
+      activities = test_set.data_entries.collect{|de| de.first}
+      prediction_dataset.data_entries.each_with_index do |de,i|
+        if de[0] and de[1] and de[1].numeric? 
+          activity = activities[i]
+          prediction = de.first
+          confidence = de[1]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+        else
+          nr_unpredicted += 1
+        end
+      end
+      validation = self.new(
+        :prediction_dataset_id => prediction_dataset.id,
+        :test_dataset_id => test_set.id,
+        :nr_instances => test_set.compound_ids.size,
+        :nr_unpredicted => nr_unpredicted,
+        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      validation.save
+      validation
+    end
+  end
+
+end
diff --git a/test/descriptor-long.rb b/test/descriptor-long.rb
new file mode 100644
index 0000000..2752d5a
--- /dev/null
+++ b/test/descriptor-long.rb
@@ -0,0 +1,13 @@
+require_relative "setup.rb"
+class DescriptorLongTest < MiniTest::Test
+
+  def test_dataset_all
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset
+    assert_equal dataset.compounds, d.compounds
+    assert_equal 332, d.features.size
+    assert_equal 332, d.data_entries.first.size
+    d.delete
+  end
+
+end
diff --git a/test/descriptor.rb b/test/descriptor.rb
new file mode 100644
index 0000000..1143b87
--- /dev/null
+++ b/test/descriptor.rb
@@ -0,0 +1,83 @@
+require_relative "setup.rb"
+
+class DescriptorTest < MiniTest::Test
+
+  def test_list
+    # check available descriptors
+    @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
+    assert_equal 111,@descriptors.size,"wrong num physchem descriptors"
+    @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
+    assert_equal 356,@descriptor_values.size,"wrong num physchem descriptors"
+    sum = 0
+    [ @descriptors, @descriptor_values ].each do |desc|
+      {"Openbabel"=>16,"Cdk"=>(desc==@descriptors ? 50 : 295),"Joelib"=>45}.each do |k,v|
+        assert_equal v,desc.select{|x| x=~/^#{k}\./}.size,"wrong num #{k} descriptors"
+        sum += v
+      end
+    end
+    assert_equal (111+356),sum
+  end
+
+  def test_smarts
+    c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1"
+    s = Smarts.find_or_create_by(:smarts => "FF")
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, s
+    assert_equal [1], result
+    smarts = ["CC", "C", "C=C", "CO", "FF", "C1CCCC1", "NN"].collect{|s| Smarts.find_or_create_by(:smarts => s)}
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, smarts
+    assert_equal [1, 1, 1, 0, 1, 1, 0], result
+    smarts_count = [10, 6, 2, 0, 2, 10, 0]
+    result = OpenTox::Algorithm::Descriptor.smarts_count c, smarts
+    assert_equal smarts_count, result
+  end
+
+  def test_compound_openbabel_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Openbabel.logP"]
+    assert_equal [1.12518], result
+  end
+
+  def test_compound_cdk_single
+    c = OpenTox::Compound.from_smiles "c1ccccc1"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [12], result
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [17], result
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.CarbonTypes"]
+    c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0}
+    assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result
+  end
+
+  def test_compound_joelib_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Joelib.LogP"]
+    assert_equal [2.65908], result
+  end
+
+  def test_compound_all
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c
+    assert_equal 332, result.size
+    assert_equal 30.8723, result[2]
+    assert_equal 1.12518, result[328]
+  end
+
+  def test_compound_descriptor_parameters
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ], true
+    assert_equal 12, result.last.size
+    assert_equal ["Openbabel.logP", "Cdk.AtomCount.nAtom", "Cdk.CarbonTypes.C1SP1", "Cdk.CarbonTypes.C2SP1", "Cdk.CarbonTypes.C1SP2", "Cdk.CarbonTypes.C2SP2", "Cdk.CarbonTypes.C3SP2", "Cdk.CarbonTypes.C1SP3", "Cdk.CarbonTypes.C2SP3", "Cdk.CarbonTypes.C3SP3", "Cdk.CarbonTypes.C4SP3", "Joelib.LogP"], result.first
+    assert_equal [1.12518, 17, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result.last
+  end
+
+  def test_dataset_descriptor_parameters
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]
+    assert_kind_of Dataset, d
+    assert_equal dataset.compounds, d.compounds
+    assert_equal dataset.compounds.size, d.data_entries.size
+    assert_equal 12, d.data_entries.first.size
+  end
+
+end
diff --git a/test/fminer-long.rb b/test/fminer-long.rb
new file mode 100644
index 0000000..826f206
--- /dev/null
+++ b/test/fminer-long.rb
@@ -0,0 +1,37 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_multicell
+    skip "multicell segfaults"
+    # TODO aborts, probably fminer
+    # or OpenBabel segfault
+    dataset = OpenTox::Dataset.new 
+    #multi_cell_call.csv
+    dataset.upload File.join(DATA_DIR,"multi_cell_call.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_isscan
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    p feature_dataset
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_kazius
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
+    # TODO reactivate default settings
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    feature_dataset = Dataset.find feature_dataset.id
+    assert feature_dataset.data_entries.size, dataset.compounds.size
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
diff --git a/test/fminer.rb b/test/fminer.rb
new file mode 100644
index 0000000..17dcbe1
--- /dev/null
+++ b/test/fminer.rb
@@ -0,0 +1,46 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_bbrc
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    refute_nil dataset.id
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
+    feature_dataset = Dataset.find feature_dataset.id
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    assert_equal 54, feature_dataset.features.size
+    assert_equal "C-C-C=C", feature_dataset.features.first.smarts
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    feature_dataset.data_entries.each_with_index do |fingerprint,i|
+      assert_equal match[i], fingerprint
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_last
+    skip "last features have to be activated"
+    dataset = OpenTox::Dataset.new
+    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    assert_equal 21, feature_dataset.features.size
+    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
+
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features.collect{|f| f.smarts}
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    compounds.each_with_index do |c,i|
+      smarts.each_with_index do |s,j|
+        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
+      end
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
-- 
cgit v1.2.3