Merge branch 'development' of github.com:opentox/opentox-ruby into development

author: mguetlein <martin.guetlein@gmail.com> 2012-02-28 10:34:58 +0100
committer: mguetlein <martin.guetlein@gmail.com> 2012-02-28 10:34:58 +0100
commit: 32cd79cd055ccd0aa9de29d68ca8c3e63663b126 (patch)
tree: 5058c65d249e39983eac1d2ae0641d069f68937e
parent: 04e8d118ec8e4d09b4e3ad6538aa7e8974613076 (diff)
parent: 754fd31f83be3ac7557a2f7832ff44fb262a98a5 (diff)
12 files changed, 1382 insertions, 851 deletions
diff --git a/Rakefile b/Rakefile
index 2f8da26..4d1d0c3 100644
--- a/Rakefile
+++ b/Rakefile
@@ -45,7 +45,6 @@ begin
     gem.add_dependency "ruby-plot", "=0.6.1"
     gem.add_dependency "gsl", "=1.14.7"
     gem.add_dependency "statsample", "=1.1.0"
-    #gem.add_dependency "statsample-optimization", "=2.1.0"
 
     gem.add_development_dependency 'jeweler'
     gem.files =  FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 3bfe789..db21c46 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -5,6 +5,8 @@ R = nil
 require "rinruby" 
 require "statsample"
 require 'uri'
+require 'transform.rb'
+require 'utils.rb'
 
 module OpenTox
 
@@ -13,7 +15,7 @@ module OpenTox
 
     include OpenTox
 
-    # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters
+    # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
     # @param [optional,Hash] params Algorithm parameters
     # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
     # @return [String] URI of new resource (dataset, model, ...)
@@ -21,7 +23,7 @@ module OpenTox
       LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
       RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
     end
-    
+
     # Get OWL-DL representation in RDF/XML format
     # @return [application/rdf+xml] RDF/XML representation
     def to_rdfxml
@@ -33,7 +35,7 @@ module OpenTox
     # Generic Algorithm class, should work with all OpenTox webservices
     class Generic 
       include Algorithm
-      
+
       # Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
       # @param [String] uri Algorithm URI
       # @return [OpenTox::Algorithm::Generic] Algorithm instance
@@ -44,14 +46,14 @@ module OpenTox
         raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
         alg
       end
-      
+
     end
 
     # Fminer algorithms (https://github.com/amaunz/fminer2)
     class Fminer
       include Algorithm
       attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
-      
+
       def check_params(params,per_mil,subjectid=nil)
         raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and  !params[:dataset_uri].nil?
         raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and  !params[:prediction_feature].nil?
@@ -81,7 +83,7 @@ module OpenTox
             LOGGER.warn "Cannot find smiles for #{compound.to_s}."
             next
           end
-          
+
           value_map=params[:value_map] unless params[:value_map].nil?
           entry.each do |feature,values|
             if feature == @prediction_feature.uri
@@ -115,23 +117,23 @@ module OpenTox
 
     end
 
-      # Backbone Refinement Class mining (http://bbrc.maunz.de/)
-      class BBRC < Fminer
-        # Initialize bbrc algorithm
-        def initialize(subjectid=nil)
-          super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
-          load_metadata(subjectid)
-        end
+    # Backbone Refinement Class mining (http://bbrc.maunz.de/)
+    class BBRC < Fminer
+      # Initialize bbrc algorithm
+      def initialize(subjectid=nil)
+        super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
+        load_metadata(subjectid)
       end
+    end
 
-      # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
-      class LAST < Fminer
-        # Initialize last algorithm
-        def initialize(subjectid=nil)
-          super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
-          load_metadata(subjectid)
-        end
+    # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
+    class LAST < Fminer
+      # Initialize last algorithm
+      def initialize(subjectid=nil)
+        super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
+        load_metadata(subjectid)
       end
+    end
 
 
     # Create lazar prediction model
@@ -144,72 +146,6 @@ module OpenTox
       end
     end
 
-    # Utility methods without dedicated webservices
-
-    # Similarity calculations
-    module Similarity
-      include Algorithm
-
-      # Tanimoto similarity
-      # @param [Array] features_a Features of first compound
-      # @param [Array] features_b Features of second compound
-      # @param [optional, Hash] weights Weights for all features
-      # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
-      # @return [Float] (Weighted) tanimoto similarity
-      def self.tanimoto(features_a,features_b,weights=nil,params=nil)
-        common_features = features_a & features_b
-        all_features = (features_a + features_b).uniq
-        #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
-        if common_features.size > 0
-          if weights
-            #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
-            if !params.nil? && params[:nr_hits]
-              params[:weights] = weights
-              params[:mode] = "min"
-              params[:features] = common_features
-              common_p_sum = Algorithm.p_sum_support(params)
-              params[:mode] = "max"
-              params[:features] = all_features
-              all_p_sum = Algorithm.p_sum_support(params)
-            else
-              common_p_sum = 0.0
-              common_features.each{|f| common_p_sum += weights[f]}
-              all_p_sum = 0.0
-              all_features.each{|f| all_p_sum += weights[f]}
-            end
-            #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
-            common_p_sum/all_p_sum
-          else
-            #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
-            common_features.size.to_f/all_features.size.to_f
-          end
-        else
-          0.0
-        end
-      end
-
-      # Euclidean similarity
-      # @param [Hash] properties_a Properties of first compound
-      # @param [Hash] properties_b Properties of second compound
-      # @param [optional, Hash] weights Weights for all properties
-      # @return [Float] (Weighted) euclidean similarity
-      def self.euclidean(properties_a,properties_b,weights=nil)
-        common_properties = properties_a.keys & properties_b.keys
-        if common_properties.size > 1
-          dist_sum = 0
-          common_properties.each do |p|
-            if weights
-              dist_sum += ( (properties_a[p] - properties_b[p]) * weights[p] )**2
-            else
-              dist_sum += (properties_a[p] - properties_b[p])**2
-            end
-          end
-          1/(1+Math.sqrt(dist_sum))
-        else
-          0.0
-        end
-      end
-    end
 
     # Structural Graph Clustering by TU Munich
     # Finds clusters similar to a query structure in a given training dataset
@@ -226,7 +162,7 @@ module OpenTox
           raise "Invalid URI."
         end
         @training_dataset_uri = training_dataset_uri
-        if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+        if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
           raise "Training threshold out of bounds."
         end
         @training_threshold = training_threshold.to_f
@@ -259,7 +195,7 @@ module OpenTox
       # @params[Float]  Similarity threshold for query to clusters (optional)
       def get_clusters query_compound_uri, query_threshold = 0.5
 
-        if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+        if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
           raise "Query threshold out of bounds."
         end
         @query_threshold = query_threshold.to_f
@@ -285,7 +221,7 @@ module OpenTox
           metadata[DC.title][pattern]=""
           feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
         }
-        
+
         # Integrity check
         unless cluster_query_dataset.compounds.size == 1
           raise "Number of predicted compounds is != 1."
@@ -295,11 +231,11 @@ module OpenTox
         query_compound_uri = cluster_query_dataset.compounds[0]
         @target_clusters_array = Array.new
         cluster_query_dataset.features.keys.each { |cluster_membership_feature|
-        
+
           # Getting dataset URI for cluster
           target_cluster = feature_clusterid_map[cluster_membership_feature]
           dataset = @clusterid_dataset_map[target_cluster]
-        
+
           # Finally look up presence
           data_entry = cluster_query_dataset.data_entries[query_compound_uri]
           present = data_entry[cluster_membership_feature][0]
@@ -311,89 +247,13 @@ module OpenTox
 
     end
 
-    module Neighbors
-
-      # Local multi-linear regression (MLR) prediction from neighbors. 
-      # Uses propositionalized setting.
-      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
-      # @return [Numeric] A prediction value.
-      def self.local_mlr_prop(params)
-
-        confidence=0.0
-        prediction=nil
-
-        if params[:neighbors].size>0
-          props = params[:prop_kernel] ? get_props(params) : nil
-          acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
-          sims = params[:neighbors].collect { |n| n[:similarity] }
-          LOGGER.debug "Local MLR (Propositionalization / GSL)."
-          prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
-          if params[:transform].to_s == "classNOP"
-            transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}])")
-          else
-            transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}]), #{params[:transform]["offset"]})")
-          end 
-          prediction = transformer.values[0]
-          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction  
-          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
-          params[:conf_stdev] = false if params[:conf_stdev].nil?
-          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
-          confidence = nil if prediction.nil?
-        end
-        {:prediction => prediction, :confidence => confidence}
-
-      end
-
-      # Multi-linear regression weighted by similarity.
-      # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
-      # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
-      # @return [Numeric] A prediction value.
-      def self.mlr(params)
-
-        # GSL matrix operations: 
-        # to_a : row-wise conversion to nested array
-        #
-        # Statsample operations (build on GSL):
-        # to_scale: convert into Statsample format
-
-        begin
-          n_prop = params[:n_prop].collect { |v| v }
-          q_prop = params[:q_prop].collect { |v| v }
-          n_prop << q_prop # attach q_prop
-          nr_cases, nr_features = get_sizes n_prop
-          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
-
-          # Principal Components Analysis
-          LOGGER.debug "PCA..."
-          pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
-          data_matrix = pca.data_transformed_matrix
-
-          # Attach intercept column to data
-          intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
-          data_matrix = data_matrix.horzcat(intercept)
-          (0..data_matrix.size2-2).each { |i|
-            autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
-            data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
-          }
 
-          # Detach query instance
-          n_prop = data_matrix.to_a
-          q_prop = n_prop.pop 
-          nr_cases, nr_features = get_sizes n_prop
-          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
 
-          # model + support vectors
-          LOGGER.debug "Creating MLR model ..."
-          c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
-          GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
-        rescue Exception => e
-          LOGGER.debug "#{e.class}: #{e.message}"
-        end
+    module Neighbors
 
-      end
 
       # Classification with majority vote from neighbors weighted by similarity
-      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @param [Hash] params Keys `:acts, :sims, :value_map` are required
       # @return [Numeric] A prediction value.
       def self.weighted_majority_vote(params)
 
@@ -402,12 +262,13 @@ module OpenTox
         confidence = 0.0
         prediction = nil
 
-        params[:neighbors].each do |neighbor|
-          neighbor_weight = neighbor[:similarity].to_f
-          neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+        LOGGER.debug "Weighted Majority Vote Classification."
 
+        params[:acts].each_index do |idx|
+          neighbor_weight = params[:sims][1][idx]
+          neighbor_contribution += params[:acts][idx] * neighbor_weight
           if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
-            case neighbor[:activity]
+            case params[:acts][idx]
             when 1
               confidence_sum -= neighbor_weight
             when 2
@@ -417,298 +278,257 @@ module OpenTox
             confidence_sum += neighbor_weight
           end
         end
-
         if params[:value_map].size == 2 
           if confidence_sum >= 0.0
-            prediction = 2 unless params[:neighbors].size==0
+            prediction = 2 unless params[:acts].size==0
           elsif confidence_sum < 0.0
-            prediction = 1 unless params[:neighbors].size==0
+            prediction = 1 unless params[:acts].size==0
           end
         else 
-          prediction = (neighbor_contribution/confidence_sum).round  unless params[:neighbors].size==0  # AM: new multinomial prediction
+          prediction = (neighbor_contribution/confidence_sum).round  unless params[:acts].size==0  # AM: new multinomial prediction
         end 
+
         LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
-        confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+        confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
         LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
         return {:prediction => prediction, :confidence => confidence.abs}
       end
 
+
+
       # Local support vector regression from neighbors 
-      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
       # @return [Numeric] A prediction value.
       def self.local_svm_regression(params)
 
-        confidence = 0.0
-        prediction = nil
-        if params[:neighbors].size>0
-          props = params[:prop_kernel] ? get_props(params) : nil
-          acts = params[:neighbors].collect{ |n| n[:activity].to_f }
-          sims = params[:neighbors].collect{ |n| n[:similarity] }
-          prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
-          if params[:transform].to_s == "classNOP"
-            transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}])")
-          else
-            transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}]), #{params[:transform]["offset"]})")
+        begin
+          confidence = 0.0
+          prediction = nil
+
+          LOGGER.debug "Local SVM."
+          if params[:acts].size>0
+            if params[:props]
+              n_prop = params[:props][0].collect
+              q_prop = params[:props][1].collect
+              props = [ n_prop, q_prop ]
+            end
+            acts = params[:acts].collect
+            prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+            prediction = nil if (!prediction.nil? && prediction.infinite?)
+            LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+            confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+            confidence = 0.0 if prediction.nil?
           end
-          prediction = transformer.values[0]
-          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction  
-          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
-          params[:conf_stdev] = false if params[:conf_stdev].nil?
-          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
-          confidence = nil if prediction.nil?
+          {:prediction => prediction, :confidence => confidence}
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
         end
-        {:prediction => prediction, :confidence => confidence}
-        
+
       end
 
-      # Local support vector classification from neighbors 
-      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+
+      # Local support vector regression from neighbors 
+      # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
       # @return [Numeric] A prediction value.
       def self.local_svm_classification(params)
 
-        confidence = 0.0
-        prediction = nil
-        if params[:neighbors].size>0
-          props = params[:prop_kernel] ? get_props(params) : nil
-          acts = params[:neighbors].collect { |n| act = n[:activity] }
-          sims = params[:neighbors].collect{ |n| n[:similarity] } # similarity values btwn q and nbors
-          prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
-          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
-          params[:conf_stdev] = false if params[:conf_stdev].nil?
-          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+        begin
+          confidence = 0.0
+          prediction = nil
+
+          LOGGER.debug "Local SVM."
+          if params[:acts].size>0
+            if params[:props]
+              n_prop = params[:props][0].collect
+              q_prop = params[:props][1].collect
+              props = [ n_prop, q_prop ]
+            end
+            acts = params[:acts].collect
+            acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
+            prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+            prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
+            confidence = 0.0 if prediction.nil?
+            LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+            confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+          end
+          {:prediction => prediction, :confidence => confidence}
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
         end
-        {:prediction => prediction, :confidence => confidence}
-        
+
       end
 
 
+
       # Local support vector prediction from neighbors. 
-      # Uses pre-defined Kernel Matrix.
+      # Uses propositionalized setting.
       # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
       # @param [Array] acts, activities for neighbors.
-      # @param [Array] sims, similarities for neighbors.
-      # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
-      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @param [Float] min_train_performance, parameter to control censoring
       # @return [Numeric] A prediction value.
-      def self.local_svm(acts, sims, type, params)
-        LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
-        neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
-        gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
+      def self.local_svm_prop(props, acts, min_train_performance)
+
+        LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+        n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+        q_prop = props[1] # is an Array.
 
         prediction = nil
         if Algorithm::zero_variance? acts
           prediction = acts[0]
         else
-          # gram matrix
-          (0..(neighbor_matches.length-1)).each do |i|
-            neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
-            gram_matrix[i] = [] unless gram_matrix[i]
-            # upper triangle
-            ((i+1)..(neighbor_matches.length-1)).each do |j|
-              neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
-              sim_params = {}
-              if params[:nr_hits]
-                sim_params[:nr_hits] = true
-                sim_params[:compound_features_hits] = neighbor_i_hits
-                sim_params[:training_compound_features_hits] = neighbor_j_hits
-              end
-              sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
-              gram_matrix[i][j] = sim
-              gram_matrix[j] = [] unless gram_matrix[j] 
-              gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
-            end
-            gram_matrix[i][i] = 1.0
-          end
-
-
           #LOGGER.debug gram_matrix.to_yaml
           @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
-          @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
-          LOGGER.debug "Setting R data ..."
-          # set data
-          @r.gram_matrix = gram_matrix.flatten
-          @r.n = neighbor_matches.size
-          @r.y = acts
-          @r.sims = sims
-
+          @r.eval "set.seed(1)"
+          @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
+          @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
+          @r.eval "registerDoMC()" # switch on parallel processing
           begin
-            LOGGER.debug "Preparing R data ..."
-            # prepare data
-            @r.eval "y<-as.vector(y)"
-            @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
-            @r.eval "sims<-as.vector(sims)"
-            
-            # model + support vectors
-            LOGGER.debug "Creating SVM model ..."
-            @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
-            @r.eval "sv<-as.vector(SVindex(model))"
-            @r.eval "sims<-sims[sv]"
-            @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
-            LOGGER.debug "Predicting ..."
-            if type == "nu-svr" 
-              @r.eval "p<-predict(model,sims)[1,1]"
-            elsif type == "C-bsvc"
-              @r.eval "p<-predict(model,sims)"
-            end
-            if type == "nu-svr"
-              prediction = @r.p
-            elsif type == "C-bsvc"
-              #prediction = (@r.p.to_f == 1.0 ? true : false)
-              prediction = @r.p
-            end
-            @r.quit # free R
-          rescue Exception => e
-            LOGGER.debug "#{e.class}: #{e.message}"
-            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-          end
 
-        end
-        prediction
-      end
-
-      # Local support vector prediction from neighbors. 
-      # Uses propositionalized setting.
-      # Not to be called directly (use local_svm_regression or local_svm_classification).
-      # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
-      # @param [Array] acts, activities for neighbors.
-      # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
-      # @return [Numeric] A prediction value.
-      def self.local_svm_prop(props, acts, type)
-
-          LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
-          n_prop = props[0] # is a matrix, i.e. two nested Arrays.
-          q_prop = props[1] # is an Array.
-
-          prediction = nil
-          if Algorithm::zero_variance? acts
-            prediction = acts[0]
-          else
-            #LOGGER.debug gram_matrix.to_yaml
-            @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
-            @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
-            LOGGER.debug "Setting R data ..."
             # set data
+            LOGGER.debug "Setting R data ..."
             @r.n_prop = n_prop.flatten
             @r.n_prop_x_size = n_prop.size
             @r.n_prop_y_size = n_prop[0].size
             @r.y = acts
             @r.q_prop = q_prop
+            #@r.eval "y = matrix(y)"
+            @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+            @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
 
-            begin
-              LOGGER.debug "Preparing R data ..."
-              # prepare data
-              @r.eval "y<-matrix(y)"
-              @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
-              @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
-              
-              # model + support vectors
-              LOGGER.debug "Creating SVM model ..."
-              @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
-              LOGGER.debug "Predicting ..."
-              if type == "nu-svr" 
-                @r.eval "p<-predict(model,q_prop)[1,1]"
-              elsif type == "C-bsvc"
-                @r.eval "p<-predict(model,q_prop)"
-              end
-              if type == "nu-svr"
-                prediction = @r.p
-              elsif type == "C-bsvc"
-                #prediction = (@r.p.to_f == 1.0 ? true : false)
-                prediction = @r.p
-              end
-              @r.quit # free R
-            rescue Exception => e
-              LOGGER.debug "#{e.class}: #{e.message}"
-              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-            end
-          end
-          prediction
-      end
+            # prepare data
+            LOGGER.debug "Preparing R data ..."
+            @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+
+            @r.eval <<-EOR
+              rem = nearZeroVar(prop_matrix)
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+              rem = findCorrelation(cor(prop_matrix))
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+            EOR
 
-      # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
-      # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
-      # @return[Float] Confidence
-      def self.get_confidence(params)
-        if params[:conf_stdev]
-          sim_median = params[:sims].to_scale.median
-          if sim_median.nil?
-            confidence = nil
-          else
-            standard_deviation = params[:acts].to_scale.standard_deviation_sample
-            confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
-            if confidence.nan?
-              confidence = nil
-            end
-          end
-        else
-          conf = params[:sims].inject{|sum,x| sum + x }
-          confidence = conf/params[:neighbors].size
-        end
-        LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
-        return confidence
-      end
+            # model + support vectors
+            LOGGER.debug "Creating R SVM model ..."
+            @r.eval <<-EOR
+              model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+              perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+            EOR
 
-      # Get X and Y size of a nested Array (Matrix)
-      def self.get_sizes(matrix)
-        begin
-          nr_cases = matrix.size
-          nr_features = matrix[0].size
-        rescue Exception => e
-          LOGGER.debug "#{e.class}: #{e.message}"
-          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-        end
-        #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
-        [ nr_cases, nr_features ]
-      end
 
-      # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
-      # Same for the vector describing the query compound
-      # @param[Array] neighbors.
-      # @param[OpenTox::Compound] query compound.
-      # @param[Array] Dataset Features.
-      # @param[Array] Fingerprints of neighbors.
-      # @param[Float] p-values of Features.
-      def self.get_props (params)
-        matrix = Array.new
-        begin 
-          params[:neighbors].each do |n|
-            n = n[:compound]
-            row = []
-            params[:features].each do |f|
-              if ! params[:fingerprints][n].nil? 
-                row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
-              else
-                row << 0.0
-              end
-            end
-            matrix << row
-          end
-          row = []
-          params[:features].each do |f|
-            if params[:nr_hits]
-              compound_feature_hits = params[:compound].match_hits([f])
-              row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
-            else
-              row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
-            end
+            # prediction
+            LOGGER.debug "Predicting ..."
+            @r.eval "p = predict(model,q_prop)"
+            @r.eval "if (class(y)!='numeric') p = as.character(p)"
+            prediction = @r.p
+
+            # censoring
+            prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+            LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
           end
-        rescue Exception => e
-          LOGGER.debug "get_props failed with '" + $! + "'"
+          @r.quit # free R
         end
-        [ matrix, row ]
+        prediction
       end
 
     end
 
+    module FeatureSelection
+      include Algorithm
+      # Recursive Feature Elimination using caret
+      # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
+      # @return [String] feature dataset CSV file composed of selected features.
+      def self.rfe(params)
+        @r=RinRuby.new(false,false)
+        @r.ds_csv_file = params[:ds_csv_file].to_s
+        @r.prediction_feature = params[:prediction_feature].to_s
+        @r.fds_csv_file = params[:fds_csv_file].to_s
+        @r.del_missing = params[:del_missing] == true ? 1 : 0
+        r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
+        @r.f_fds_r = r_result_file.to_s
+        
+        # need packs 'randomForest', 'RANN'
+        @r.eval <<-EOR
+          set.seed(1)
+          suppressPackageStartupMessages(library('caret'))
+          suppressPackageStartupMessages(library('randomForest'))
+          suppressPackageStartupMessages(library('RANN'))
+          suppressPackageStartupMessages(library('doMC'))
+          registerDoMC()
+          
+          acts = read.csv(ds_csv_file, check.names=F)
+          feats = read.csv(fds_csv_file, check.names=F)
+          ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
+          
+          features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
+          y = ds[,which(names(ds) == prediction_feature)] 
+          
+          # assumes a data matrix 'features' and a vector 'y' of target values
+          row.names(features)=NULL
+          
+          pp = NULL
+          if (del_missing) {
+            # needed if rows should be removed
+            na_ids = apply(features,1,function(x)any(is.na(x)))
+            features = features[!na_ids,]
+            y = y[!na_ids]
+            pp = preProcess(features, method=c("scale", "center"))
+          } else {
+            # Use imputation if NA's random (only then!)
+            pp = preProcess(features, method=c("scale", "center", "knnImpute"))
+          }
+          features = predict(pp, features)
+          
+          # determine subsets
+          subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+          subsets = c(2,3,4,5,7,10,subsets)
+          subsets = unique(sort(round(subsets))) 
+          subsets = subsets[subsets<=dim(features)[2]]
+          subsets = subsets[subsets>1] 
+          
+          # Recursive feature elimination
+          rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+          
+          # read existing dataset and select most useful features
+          csv=feats[,c("SMILES", rfProfile$optVariables)]
+          write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
+        EOR
+        r_result_file
+      end
+    end
+
     module Substructure
       include Algorithm
       # Substructure matching
-      # @param [OpenTox::Compound] compound Compound
-      # @param [Array] features Array with Smarts strings
+      # @param [Hash] required keys: compound, features
       # @return [Array] Array with matching Smarts
-      def self.match(compound,features)
-        compound.match(features)
+      def self.match(params)
+        params[:compound].match(params[:features])
       end
+
+      # Substructure matching with number of non-unique hits
+      # @param [Hash] required keys: compound, features
+      # @return [Hash] Hash with matching Smarts and number of hits 
+      def self.match_hits(params)
+        params[:compound].match_hits(params[:features])
+      end
+
+      # Substructure matching with number of non-unique hits
+      # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
+      # @return [Hash] Hash with matching Smarts and number of hits 
+      def self.lookup(params)
+        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type])
+      end  
     end
 
     module Dataset
@@ -717,283 +537,5 @@ module OpenTox
       def features(dataset_uri,compound_uri)
       end
     end
-
-    module Transform
-      include Algorithm
-
-      # The transformer that inverts values.
-      # 1/x is used, after values have been moved >= 1.
-      class Inverter
-        attr_accessor :offset, :values
-
-        # @params[Array] Values to transform.
-        # @params[Float] Offset for restore.
-        def initialize *args
-          case args.size
-          when 1
-            begin
-              values=args[0]
-              raise "Cannot transform, values empty." if @values.size==0
-              @values = values.collect { |v| -1.0 * v }  
-              @offset = 1.0 - @values.minmax[0] 
-              @offset = -1.0 * @offset if @offset>0.0 
-              @values.collect! { |v| v - @offset }   # slide >1
-              @values.collect! { |v| 1 / v }         # invert to [0,1]
-            rescue Exception => e
-              LOGGER.debug "#{e.class}: #{e.message}"
-              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-            end
-          when 2
-            @offset = args[1].to_f
-            @values = args[0].collect { |v| 1 / v }
-            @values.collect! { |v| v + @offset }
-            @values.collect! { |v| -1.0 * v }
-          end
-        end
-      end
-
-      # The transformer that takes logs.
-      # Log10 is used, after values have been moved > 0.
-      class Log10
-        attr_accessor :offset, :values
-
-        # @params[Array] Values to transform / restore.
-        # @params[Float] Offset for restore.
-        def initialize *args
-          @distance_to_zero = 0.000000001 # 1 / 1 billion
-          case args.size
-          when 1
-            begin
-              values=args[0]
-              raise "Cannot transform, values empty." if values.size==0
-              @offset = values.minmax[0] 
-              @offset = -1.0 * @offset if @offset>0.0 
-              @values = values.collect { |v| v - @offset }   # slide > anchor
-              @values.collect! { |v| v + @distance_to_zero }  #
-              @values.collect! { |v| Math::log10 v } # log10 (can fail)
-            rescue Exception => e
-              LOGGER.debug "#{e.class}: #{e.message}"
-              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-            end
-          when 2
-            @offset = args[1].to_f
-            @values = args[0].collect { |v| 10**v }
-            @values.collect! { |v| v - @distance_to_zero }
-            @values.collect! { |v| v + @offset }
-          end
-        end
-      end
-
-      # The transformer that does nothing (No OPeration).
-      class NOP
-        attr_accessor :offset, :values
-
-        # @params[Array] Values to transform / restore.
-        # @params[Float] Offset for restore.
-        def initialize *args
-          @offset = 0.0
-          @distance_to_zero = 0.0
-          case args.size
-          when 1
-            @values = args[0]
-          when 2
-            @values = args[0]
-          end
-        end
-      end
-
-
-      # Auto-Scaler for Arrays
-      # Center on mean and divide by standard deviation
-      class AutoScale 
-        attr_accessor :scaled_values, :mean, :stdev
-
-        # @params[Array] Values to transform.
-        def initialize values
-          @scaled_values = values
-          @mean = @scaled_values.to_scale.mean
-          @stdev = @scaled_values.to_scale.standard_deviation_sample
-          @scaled_values = @scaled_values.collect {|vi| vi - @mean }
-          @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
-        end
-      end
-
-      # Principal Components Analysis
-      # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
-      class PCA
-        attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
-
-        # Creates a transformed dataset as GSL::Matrix.
-        # @param [GSL::Matrix] Data matrix.
-        # @param [Float] Compression ratio from [0,1].
-        # @return [GSL::Matrix] Data transformed matrix.
-        def initialize data_matrix, compression=0.05
-          begin
-            @data_matrix = data_matrix
-            @compression = compression.to_f
-            @stdev = Array.new
-            @mean = Array.new
-
-            # Objective Feature Selection
-            raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
-            @data_matrix_selected = nil
-            (0..@data_matrix.size2-1).each { |i|
-              if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
-                if @data_matrix_selected.nil?
-                  @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1) 
-                  @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
-                else
-                  @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
-                end
-              end             
-            }
-            raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
-
-            # Scaling of Axes
-            @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
-            (0..@data_matrix_selected.size2-1).each { |i|
-              @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
-              @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
-              @stdev << @autoscaler.stdev
-              @mean << @autoscaler.mean
-            }
-
-            data_matrix_hash = Hash.new
-            (0..@data_matrix_scaled.size2-1).each { |i|
-              column_view = @data_matrix_scaled.col(i)
-              data_matrix_hash[i] = column_view.to_scale
-            }
-            dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
-            cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
-            pca=Statsample::Factor::PCA.new(cor_matrix)
-            pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
-            @eigenvalue_sums = Array.new
-            (0..dataset_hash.fields.size-1).each { |i|
-              @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
-            }
-            eigenvectors_selected = Array.new
-            pca.eigenvectors.each_with_index { |ev, i|
-              if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
-                eigenvectors_selected << ev.to_a
-              end
-            }
-            @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
-            dataset_matrix = dataset_hash.to_gsl.transpose
-            @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
-          rescue Exception => e
-              LOGGER.debug "#{e.class}: #{e.message}"
-              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-          end
-        end
-
-        # Restores data in the original feature space (possibly with compression loss).
-        # @return [GSL::Matrix] Data matrix.
-        def restore
-          begin 
-            data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
-            # reverse scaling
-            (0..data_matrix_restored.size2-1).each { |i|
-              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
-              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
-            }
-            data_matrix_restored
-          rescue Exception => e
-            LOGGER.debug "#{e.class}: #{e.message}"
-            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
-          end
-        end
-
-      end
-
-    end
-   
-    # DV: Removed Gauss kernel in this file since the additional calculations has no benefit for the result.
-    # See experiment summary: http://goo.gl/uwiKt
-    # Gauss kernel
-    # @return [Float] 
-    #def self.gauss(x, sigma = 0.3) 
-    #  d = 1.0 - x.to_f
-    #  Math.exp(-(d*d)/(2*sigma*sigma))
-    #end
-
-    # For symbolic features
-    # @param [Array] Array to test, must indicate non-occurrence with 0.
-    # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
-    def self.isnull_or_singular?(array)
-      nr_zeroes = array.count(0)
-      return (nr_zeroes == array.size) ||    # remove non-occurring feature
-             (nr_zeroes == array.size-1) ||  # remove singular feature
-             (nr_zeroes == 0)                # also remove feature present everywhere
-    end
-
-    # Numeric value test
-    # @param[Object] value
-    # @return [Boolean] Whether value is a number
-    def self.numeric?(value)
-      true if Float(value) rescue false
-    end
-
-    # For symbolic features
-    # @param [Array] Array to test, must indicate non-occurrence with 0.
-    # @return [Boolean] Whether the feature has variance zero.
-    def self.zero_variance?(array)
-      return (array.to_scale.variance_population == 0.0)
-    end
-    
-    # Sum of an array for Arrays.
-    # @param [Array] Array with values
-    # @return [Integer] Sum of size of values
-    def self.sum_size(array)
-      sum=0
-      array.each { |e| sum += e.size }
-      return sum
-    end
-
-    # Minimum Frequency
-    # @param [Integer] per-mil value
-    # return [Integer] min-frequency
-    def self.min_frequency(training_dataset,per_mil)
-      minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
-      minfreq = 2 unless minfreq > 2
-      Integer (minfreq)
-    end
-
-    # Effect calculation for classification
-    # @param [Array] Array of occurrences per class in the form of Enumerables.
-    # @param [Array] Array of database instance counts per class.
-    def self.effect(occurrences, db_instances)
-      max=0
-      max_value=0
-      nr_o = self.sum_size(occurrences)
-      nr_db = db_instances.to_scale.sum
-
-      occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
-        actual = o.size.to_f/nr_o
-        expected = db_instances[i].to_f/nr_db
-        if actual > expected
-          if ((actual - expected) / actual) > max_value
-           max_value = (actual - expected) / actual # 'Schleppzeiger'
-            max = i
-          end
-        end
-      }
-      max
-    end
-    
-    # Returns Support value of an fingerprint
-    # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
-    # return [Numeric] Support value 
-    def self.p_sum_support(params)
-      p_sum = 0.0
-        params[:features].each{|f|
-        compound_hits = params[:compound_features_hits][f]
-        neighbor_hits = params[:training_compound_features_hits][f] 
-        p_sum += eval("(params[:weights][f] * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
-      }
-      p_sum 
-    end
-                
   end
 end
-
-
diff --git a/lib/compound.rb b/lib/compound.rb
index e7b4da0..a2b7e48 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -6,13 +6,15 @@ module OpenTox
   # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
 	class Compound 
 
+    include OpenTox
+
 		attr_accessor :inchi, :uri
 
 		# Create compound with optional uri
     # @example
-    #   compound = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"")
+    #   compound = Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"")
     # @param [optional, String] uri Compound URI
-    # @return [OpenTox::Compound] Compound
+    # @return [Compound] Compound
 		def initialize(uri=nil)
       @uri = uri
       case @uri
@@ -36,9 +38,9 @@ module OpenTox
 
     # Create a compound from smiles string
     # @example
-    #   compound = OpenTox::Compound.from_smiles("c1ccccc1")
+    #   compound = Compound.from_smiles("c1ccccc1")
     # @param [String] smiles Smiles string
-    # @return [OpenTox::Compound] Compound
+    # @return [Compound] Compound
     def self.from_smiles(smiles)
       c = Compound.new
       c.inchi = Compound.smiles2inchi(smiles)
@@ -48,7 +50,7 @@ module OpenTox
 
     # Create a compound from inchi string
     # @param [String] smiles InChI string
-    # @return [OpenTox::Compound] Compound
+    # @return [Compound] Compound
     def self.from_inchi(inchi)
       c = Compound.new
       c.inchi = inchi
@@ -58,7 +60,7 @@ module OpenTox
 
     # Create a compound from sdf string
     # @param [String] smiles SDF string
-    # @return [OpenTox::Compound] Compound
+    # @return [Compound] Compound
     def self.from_sdf(sdf)
       c = Compound.new
       c.inchi = Compound.sdf2inchi(sdf)
@@ -68,9 +70,9 @@ module OpenTox
 
     # Create a compound from name. Relies on an external service for name lookups.
     # @example
-    #   compound = OpenTox::Compound.from_name("Benzene")
+    #   compound = Compound.from_name("Benzene")
     # @param [String] name name can be also an InChI/InChiKey, CAS number, etc
-    # @return [OpenTox::Compound] Compound
+    # @return [Compound] Compound
     def self.from_name(name)
       c = Compound.new
       # paranoid URI encoding to keep SMILES charges and brackets
@@ -131,7 +133,7 @@ module OpenTox
 
 		# Match a smarts string
     # @example
-    #   compound = OpenTox::Compound.from_name("Benzene")
+    #   compound = Compound.from_name("Benzene")
     #   compound.match?("cN") # returns false
     # @param [String] smarts Smarts string
 		def match?(smarts)
@@ -146,7 +148,7 @@ module OpenTox
 
 		# Match an array of smarts strings, returns array with matching smarts
     # @example
-    #   compound = OpenTox::Compound.from_name("Benzene")
+    #   compound = Compound.from_name("Benzene")
     #   compound.match(['cc','cN']) # returns ['cc']
     # @param [Array] smarts_array Array with Smarts strings
     # @return [Array] Array with matching Smarts strings
@@ -166,7 +168,7 @@ module OpenTox
 
     # Match_hits an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value 
     # @example
-    #   compound = OpenTox::Compound.from_name("Benzene")
+    #   compound = Compound.from_name("Benzene")
     #   compound.match(['cc','cN']) # returns ['cc']
     # @param [Array] smarts_array Array with Smarts strings
     # @return [Hash] Hash with matching smarts as key and number of non-unique hits as value
@@ -191,6 +193,43 @@ module OpenTox
       return smarts_hits
       #smarts_array.collect { |s| s if match?(s)}.compact
 		end
+    
+    # Lookup numerical values, returns hash with feature name as key and value as value 
+    # @param [Array] Array of feature names
+    # @param [String] Feature dataset uri
+    # @return [Hash] Hash with feature name as key and value as value
+		def lookup(feature_array,feature_dataset_uri,pc_type)
+      ds = OpenTox::Dataset.find(feature_dataset_uri)
+
+      #entry = ds.data_entries[self.uri]
+      entry = nil 
+      ds.data_entries.each { |c_uri, values| 
+        if c_uri.split('/compound/').last == self.to_inchi
+          entry = ds.data_entries[self.uri]
+          break
+        end
+      }
+
+      LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
+
+      if entry.nil?
+        uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
+        uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi)
+        ds = OpenTox::Dataset.find(uri)
+        entry = ds.data_entries[self.uri]
+        ds.delete
+      end
+
+      features = entry.keys
+      features.each { |feature| 
+        new_feature = File.join(feature_dataset_uri, "feature", feature.split("/").last) 
+        entry[new_feature] = entry[feature].flatten.first.to_f # see algorithm/lazar.rb:182, to_f because feature type detection doesn't work w 1 entry
+        entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
+      }
+      #res = feature_array.collect {|v| entry[v]}
+      #LOGGER.debug "----- am #{entry.to_yaml}"
+      entry
+		end
 
 
     # Get URI of compound image with highlighted fragments
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 3662527..95c1918 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -288,7 +288,7 @@ module OpenTox
 
     # Insert a statement (compound_uri,feature_uri,value)
     # @example Insert a statement (compound_uri,feature_uri,value)
-    #   dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true
+    #   dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", 1
     # @param [String] compound Compound URI
     # @param [String] feature Compound URI
     # @param [Boolean,Float] value Feature value
@@ -315,6 +315,16 @@ module OpenTox
       @features[feature] = metadata
     end
 
+    # Complete feature values by adding zeroes
+    def complete_data_entries
+      all_features = @features.keys
+      @data_entries.each { |c, e|
+        (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
+          self.add(c,f,0)
+        }
+      }
+    end
+
     # Add/modify metadata for a feature
     # @param [String] feature Feature URI
     # @param [Hash] metadata Hash with feature metadata
diff --git a/lib/environment.rb b/lib/environment.rb
index 6a72ba5..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC =  OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
 OT =  OwlNamespace.new 'http://www.opentox.org/api/1.1#'
 OTA =  OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
 XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
 
diff --git a/lib/model.rb b/lib/model.rb
index 0b116c2..b3de1a3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -102,8 +102,8 @@ module OpenTox
       include Algorithm
       include Model
 
-      attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev, :prediction_min_max
 
+      attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
       def initialize(uri=nil)
 
         if uri
@@ -120,18 +120,11 @@ module OpenTox
         @p_values = {}
         @fingerprints = {}
         @value_map = {}
-        @prediction_min_max = [] 
 
         @feature_calculation_algorithm = "Substructure.match"
         @similarity_algorithm = "Similarity.tanimoto"
         @prediction_algorithm = "Neighbors.weighted_majority_vote"
         
-        @nr_hits = false
-        @min_sim = 0.3
-        @prop_kernel = false
-        @transform = { "class" => "NOP"  }
-        @conf_stdev = false
-
       end
 
       # Get URIs of all lazar models
@@ -174,19 +167,14 @@ module OpenTox
         lazar.feature_calculation_algorithm = hash["feature_calculation_algorithm"] if hash["feature_calculation_algorithm"]
         lazar.similarity_algorithm = hash["similarity_algorithm"] if hash["similarity_algorithm"]
         lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
-        lazar.min_sim = hash["min_sim"] if hash["min_sim"]
         lazar.subjectid = hash["subjectid"] if hash["subjectid"]
-        lazar.prop_kernel = hash["prop_kernel"] if hash["prop_kernel"]
         lazar.value_map = hash["value_map"] if hash["value_map"]
-        lazar.nr_hits = hash["nr_hits"] if hash["nr_hits"]
-        lazar.transform = hash["transform"] if hash["transform"]
-        lazar.conf_stdev = hash["conf_stdev"] if hash["conf_stdev"]
-        lazar.prediction_min_max = hash["prediction_min_max"] if hash["prediction_min_max"]
+
         lazar
       end
 
       def to_json
-        Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :min_sim => @min_sim, :subjectid => @subjectid, :prop_kernel => @prop_kernel, :value_map => @value_map, :nr_hits => @nr_hits, :transform => @transform, :conf_stdev => @conf_stdev, :prediction_min_max => @prediction_min_max})
+        Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
       end
 
       def run( params, accept_header=nil, waiting_task=nil )
@@ -230,8 +218,11 @@ module OpenTox
             predict(compound_uri,false,subjectid)
             count += 1
             waiting_task.progress( count/d.compounds.size.to_f*100.0 ) if waiting_task
-          rescue => ex
-            LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+ex.message+" subjectid: #{subjectid}"
+          rescue => e
+            LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+e.message+" subjectid: #{subjectid}"
+            #LOGGER.debug "#{e.class}: #{e.message}"
+            #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+
           end
         end
         #@prediction_dataset.save(subjectid)
@@ -261,25 +252,41 @@ module OpenTox
         if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
           all_activities = [] 
           all_activities = @activities.values.flatten.collect! { |i| i.to_f }
-          @prediction_min_max[0] = (all_activities.to_scale.min/2)
-          @prediction_min_max[1] = (all_activities.to_scale.max*2)
         end
 
         unless database_activity(subjectid) # adds database activity to @prediction_dataset
 
-          neighbors
-          prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors, 
-                                                          :compound => @compound,
-                                                          :features => @features, 
-                                                          :p_values => @p_values, 
-                                                          :fingerprints => @fingerprints,
-                                                          :similarity_algorithm => @similarity_algorithm, 
-                                                          :prop_kernel => @prop_kernel,
+          # Calculation of needed values for query compound
+          @compound_features = eval("#{@feature_calculation_algorithm}({
+                                    :compound => @compound, 
+                                    :features => @features, 
+                                    :feature_dataset_uri => @metadata[OT.featureDataset],
+                                    :pc_type => self.parameter(\"pc_type\")
+                                    })")
+          
+          # Adding fingerprint of query compound with features and values(p_value*nr_hits)
+          @compound_fingerprints = {}
+          @compound_features.each do |feature, value| # value is nil if "Substructure.match"
+            if @feature_calculation_algorithm == "Substructure.match_hits" 
+              @compound_fingerprints[feature] = @p_values[feature] * value
+            elsif @feature_calculation_algorithm == "Substructure.match"
+              @compound_fingerprints[feature] = @p_values[feature]
+            elsif @feature_calculation_algorithm == "Substructure.lookup"
+              @compound_fingerprints[feature] = value
+            end
+          end
+
+          # Transform model data to machine learning scheme (tables of data)
+          mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
+          mtf.transform
+
+          # Make a prediction
+          prediction = eval("#{@prediction_algorithm}( { :props => mtf.props,
+                                                          :acts => mtf.acts,
+                                                          :sims => mtf.sims,
                                                           :value_map => @value_map,
-                                                          :nr_hits => @nr_hits,
-                                                          :conf_stdev => @conf_stdev,
-                                                          :prediction_min_max => @prediction_min_max,
-                                                          :transform => @transform } ) ")
+                                                          :min_train_performance => self.parameter(\"min_train_performance\")
+                                                        } ) ")
 
           value_feature_uri = File.join( @uri, "predicted", "value")
           confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@@ -355,44 +362,6 @@ module OpenTox
         @prediction_dataset
       end
 
-      
-
-      # Find neighbors and store them as object variable, access all compounds for that.
-      def neighbors
-        @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
-        @neighbors = []
-        @fingerprints.keys.each do |training_compound| # AM: access all compounds
-          add_neighbor @fingerprints[training_compound].keys, training_compound
-        end
-      end
-
-      # Adds a neighbor to @neighbors if it passes the similarity threshold.
-      def add_neighbor(training_features, training_compound)
-        compound_features_hits = {}
-        training_compound_features_hits = {}
-        if @nr_hits
-          compound_features_hits = @compound.match_hits(@compound_features)
-          training_compound_features_hits = @fingerprints[training_compound]
-          #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class}  #{training_compound_features_hits}"
-        end
-        params = {}
-        params[:nr_hits] = @nr_hits
-        params[:compound_features_hits] = compound_features_hits
-        params[:training_compound_features_hits] = training_compound_features_hits
-
-        sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)")
-        if sim > @min_sim
-          @activities[training_compound].each do |act|
-            @neighbors << {
-              :compound => training_compound,
-              :similarity => sim,
-              :features => training_features,
-              :activity => act
-            }
-          end
-        end
-      end
-
       # Find database activities and store them in @prediction_dataset
       # @return [Boolean] true if compound has databasse activities, false if not
       def database_activity(subjectid)
diff --git a/lib/parser.rb b/lib/parser.rb
index 749a41d..56e4fed 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -290,10 +290,11 @@ module OpenTox
         @features = []
         @feature_types = {}
 
-        @format_errors = ""
-        @smiles_errors = []
+        @format_errors = []
+        @id_errors = []
         @activity_errors = []
         @duplicates = {}
+        @max_class_values = 3
       end
 
       def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
       # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
       # @param [Excel] book Excel workbook object (created with roo gem)
       # @return [OpenTox::Dataset] Dataset object with Excel data
-      def load_spreadsheet(book)
+      def load_spreadsheet(book, drop_missing=false)
         book.default_sheet = 0
-        add_features book.row(1)
+        headers = book.row(1)
+        add_features headers
         value_maps = Array.new
         regression_features=Array.new
 
@@ -319,15 +321,27 @@ module OpenTox
           row = book.row(i)
           value_maps = detect_new_values(row, value_maps)
           value_maps.each_with_index { |vm,j|
-            if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+            if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer.
               regression_features[j]=true 
             else
               regression_features[j]=false
             end
           }
         }
+
         2.upto(book.last_row) { |i| 
-          add_values book.row(i), regression_features
+          drop=false
+          row = book.row(i)
+          raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+          if row.include?("")
+            @format_errors << "Row #{i} has #{row.count("")} missing values" 
+            drop=true
+            drop_missing=true if (row.count("") == row.size-1) 
+          end
+          add_values(row, regression_features) unless (drop_missing && drop)
+          if (drop_missing && drop) 
+            @format_errors << "Row #{i} not added" 
+          end
         }
         warnings
         @dataset
@@ -336,10 +350,11 @@ module OpenTox
       # Load CSV string (format specification: http://toxcreate.org/help)
       # @param [String] csv CSV representation of the dataset
       # @return [OpenTox::Dataset] Dataset object with CSV data
-      def load_csv(csv)
+      def load_csv(csv, drop_missing=false)
         row = 0
         input = csv.split("\n")
-        add_features split_row(input.shift)
+        headers = split_row(input.shift)
+        add_features(headers)
         value_maps = Array.new
         regression_features=Array.new
 
@@ -347,15 +362,27 @@ module OpenTox
           row = split_row(row)
           value_maps = detect_new_values(row, value_maps)
           value_maps.each_with_index { |vm,j|
-            if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+            if vm.size > @max_class_values # max @max_class_values classes.
               regression_features[j]=true 
             else
               regression_features[j]=false
             end
           }
         }
-        input.each { |row| 
-          add_values split_row(row), regression_features
+
+        input.each_with_index { |row, i| 
+          drop=false
+          row = split_row(row)
+          raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+          if row.include?("")
+            @format_errors << "Row #{i} has #{row.count("")} missing values" 
+            drop=true
+            drop_missing=true if (row.count("") == row.size-1) 
+          end
+          add_values(row, regression_features) unless (drop_missing && drop)
+          if (drop_missing && drop) 
+            @format_errors << "Row #{i} not added" 
+          end
         }
         warnings
         @dataset
@@ -367,88 +394,115 @@ module OpenTox
 
         info = ''
         @feature_types.each do |feature,types|
-          if types.uniq.size > 1
+          if types.uniq.size == 0
+            type = "helper#MissingFeature"
+          elsif types.uniq.size > 1
             type = OT.NumericFeature
           else
             type = types.first
           end
           @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-          info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+          info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
 
           # TODO: rewrite feature values
-          # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
+          # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
         end
 
         @dataset.metadata[OT.Info] = info 
 
         warnings = ''
-        warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
+        warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
         warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+        warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
         duplicate_warnings = ''
         @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
-        warnings += "<p>Duplicated structures (all structures/activities used for model building, please  make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+        warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
 
         @dataset.metadata[OT.Warnings] = warnings 
 
       end
 
+      # Adds a row of features to a dataset
+      # @param Array A row split up as an array
+      # @return Array Indices for duplicate features
       def add_features(row)
-        row.shift  # get rid of smiles entry
-        row.each do |feature_name|
+        row=row.collect
+        row.shift  # get rid of id entry
+        @duplicate_feature_indices = [] # starts with 0 at first f after id
+        row.each_with_index do |feature_name, idx|
           feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
-          @feature_types[feature_uri] = []
-          @features << feature_uri
-          @dataset.add_feature(feature_uri,{DC.title => feature_name})
+          unless @features.include? feature_uri
+            @feature_types[feature_uri] = []
+            @features << feature_uri
+            @dataset.add_feature(feature_uri,{DC.title => feature_name})
+          else
+            @duplicate_feature_indices << idx
+            @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
+          end
         end
       end
 
       # Adds a row to a dataset
       # @param Array A row split up as an array
       # @param Array Indicator for regression for each field
+      # @param Array Indices for duplicate features
       def add_values(row, regression_features)
 
-        smiles = row.shift
-        compound = Compound.from_smiles(smiles)
+        id = row.shift
+        case id
+        when /InChI/
+          compound = Compound.from_inchi(URI.decode_www_form_component(id))
+        else
+          compound = Compound.from_smiles(id)
+        end
+
         if compound.nil? or compound.inchi.nil? or compound.inchi == ""
-          @smiles_errors << smiles+", "+row.join(", ") 
+          @id_errors << id+", "+row.join(", ") 
           return false
         end
         @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
-        @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+        @duplicates[compound.inchi] << id+", "+row.join(", ")
 
+        feature_idx = 0
         row.each_index do |i|
-          value = row[i]
-          feature = @features[i]
 
-          type = nil
-          if (regression_features[i])
-            type = feature_type(value)
-            if type != OT.NumericFeature
-              raise "Error! Expected numeric values."
+          unless @duplicate_feature_indices.include? i
+
+            value = row[i]
+            #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
+            feature = @features[feature_idx]
+  
+            type = feature_type(value) # May be NIL
+            type = OT.NominalFeature unless (type.nil? || regression_features[i])
+            @feature_types[feature] << type if type
+  
+            val = nil
+            case type
+            when OT.NumericFeature
+              val = value.to_f
+            when OT.NominalFeature
+              val = value.to_s
             end
-          else
-            type = OT.NominalFeature
-          end
-          @feature_types[feature] << type 
 
-          case type
-          when OT.NumericFeature
-            val = value.to_f
-          when OT.NominalFeature
-            val = value.to_s
-          end
-          if val!=nil
-            @dataset.add(compound.uri, feature, val)
-            if type!=OT.NumericFeature
-              @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-              @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+            feature_idx += 1
+  
+            if val != nil
+              @dataset.add(compound.uri, feature, val)
+              if type != OT.NumericFeature
+                @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+                @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+              end
             end
+
           end
+
         end
       end
 
       def feature_type(value)
-        if OpenTox::Algorithm::numeric? value
+        if value == ""
+          return nil
+        elsif OpenTox::Algorithm::numeric? value
           return OT.NumericFeature
         else
           return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
       end
 
       def split_row(row)
-        row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
+        row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
       end
 
     end
@@ -468,6 +522,7 @@ module OpenTox
       def initialize
         @data = {}
         @activity_errors = []
+        @max_class_values = 3
       end
 
       def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
       def clean_features
         ignored_features = []
         features.each do |feature|
-          if feature_values(feature).size > 5
+          if feature_values(feature).size > @max_class_values
             if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
               # REGRESSION
             elsif feature_types(feature).include? OT.NumericFeature
               @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
               @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
             else
-              @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+              @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
               ignored_features << feature
               next
             end
@@ -543,12 +598,15 @@ module OpenTox
       private
 
       def feature_type(value)
-        if OpenTox::Algorithm::numeric? value
+        if value.nil?
+          return nil
+        elsif OpenTox::Algorithm::numeric? value
           return OT.NumericFeature
         else
           return OT.NominalFeature
         end
       end
+
     end
 
     # quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
             @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
             compound = Compound.from_inchi inchi
           rescue
-            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
             next
           end
           row = {}
           obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
           table.data[compound.uri] = row
         end
-
-        # finda and remove ignored_features
+        
+        # find and remove ignored_features
         @activity_errors = table.clean_features
         table.add_to_dataset @dataset
 
         warnings = ''
-        warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+        warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
         warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
         duplicate_warnings = ''
         @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
diff --git a/lib/rest_client_wrapper.rb b/lib/rest_client_wrapper.rb
index 6d25bb3..fcadebb 100644
--- a/lib/rest_client_wrapper.rb
+++ b/lib/rest_client_wrapper.rb
@@ -70,7 +70,7 @@ module OpenTox
       
       begin
         #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
-        resource = RestClient::Resource.new(uri,{:timeout => 60})
+        resource = RestClient::Resource.new(uri,{:timeout => 600})
         if rest_call=="post" || rest_call=="put"
           result = resource.send(rest_call, payload, headers)
         else
diff --git a/lib/serializer.rb b/lib/serializer.rb
index a010ce5..30cb2ba 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -55,7 +55,7 @@ module OpenTox
           OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
           OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
 
-          #object props for validation#           
+          #object props for validation#
           OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
           OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
           OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
           OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
           OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
 
-          # annotation props for validation        
+          # annotation props for validation
           OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
           OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
           OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
         @data_entries = {}
         @values_id = 0
         @parameter_id = 0
-        
-        @classes = Set.new 
+
+        @classes = Set.new
         @object_properties = Set.new
         @annotation_properties = Set.new
         @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
         @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
         add_metadata uri, metadata
       end
-      
+
       # Add a resource defined by resource_class and content
       # (see documentation of add_content for example)
       # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
       def add_uri(uri,type)
         @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
       end
-      
+
       private
       @@content_id = 1
-      
+
       #Recursiv function to add content
       #@example
       #  { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
         hash.each do |u,v|
           if v.is_a? Hash
             # value is again a hash, i.e. a new owl class is added
-            # first make sure type (==class) is set 
+            # first make sure type (==class) is set
             type = v[RDF.type]
             raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
             raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
             # add content to new class
             add_content(genid,v)
           elsif v.is_a? Array
-            # value is an array, i.e. a list of values with property is added 
+            # value is an array, i.e. a list of values with property is added
             v.each{ |vv| add_content( uri, { u => vv } ) }
           else # v.is_a? String
             # simple string value
@@ -268,7 +268,7 @@ module OpenTox
           end
         end
       end
-      
+
       public
 
       # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
           v = [{ "type" => "uri", "value" => value}]
         when "literal"
           v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
-        else 
+        else
           raise "Illegal type #{type(value)} for #{value}."
         end
         @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
       end
 
       # Serializers
-      
+
       # Convert to N-Triples
       # @return [text/plain] Object OWL-DL in N-Triples format
       def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
           entry.each do |p,objects|
             p = url(p)
             objects.each do |o|
-              case o["type"] 
+              case o["type"]
               when "uri"
                 o = url(o["value"])
               when "literal"
@@ -371,9 +371,15 @@ module OpenTox
       # Convert to RDF/XML
       # @return [text/plain] Object OWL-DL in RDF/XML format
       def to_rdfxml
-        Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+        tmpf = Tempfile.open("owl-serializer")
+        tmpf.write(self.to_ntriples)
+        tmpf.flush
+        @path = tmpf.path
         # TODO: add base uri for ist services
-        `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+        res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+        tmpf.close
+        tmpf.delete
+        res
       end
 
       # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
@@ -427,20 +433,20 @@ module OpenTox
       end
 
       def literal(value,type)
-        # concat and << are faster string concatination operators than + 
+        # concat and << are faster string concatination operators than +
         '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
       end
 
       def url(uri)
-        # concat and << are faster string concatination operators than + 
+        # concat and << are faster string concatination operators than +
         '<'.concat(uri).concat('>')
       end
 
       def rdf_types
-        @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } } 
-        @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } } 
-        @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } } 
-        @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } } 
+        @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+        @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+        @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+        @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
       end
 
     end
@@ -457,27 +463,38 @@ module OpenTox
         @rows.first << features
         @rows.first.flatten!
         dataset.data_entries.each do |compound,entries|
-          smiles = Compound.new(compound).to_smiles
+          cmpd = Compound.new(compound)
+          smiles = cmpd.to_smiles
+          inchi = URI.encode_www_form_component(cmpd.to_inchi)
+          row_container = Array.new
           row = Array.new(@rows.first.size)
-          row[0] = smiles
+          row_container << row
+          #row[0] = smiles
+          row[0] = inchi
           entries.each do |feature, values|
             i = features.index(feature)+1
             values.each do |value|
-              if row[i] 
-                row[i] = "#{row[i]} #{value}" # multiple values
+              if row_container[0][i]
+                #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+                row_container << row_container.last.collect
+                row_container.last[i] = value
+                #LOGGER.debug "RC: #{row_container.to_yaml}"
               else
-                row[i] = value 
+                row_container.each { |r| r[i] = value }
               end
             end
           end
-          @rows << row
+          row_container.each { |r| @rows << r }
         end
       end
 
       # Convert to CSV string
       # @return [String] CSV string
       def to_csv
-        @rows.collect{|r| r.join(", ")}.join("\n")
+        rows = @rows.collect
+        result = ""
+        result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+        result << rows.collect{ |r| r.join(",") }.join("\n")
       end
 
       # Convert to spreadsheet workbook
diff --git a/lib/transform.rb b/lib/transform.rb
new file mode 100644
index 0000000..8fe1093
--- /dev/null
+++ b/lib/transform.rb
@@ -0,0 +1,520 @@
+module OpenTox
+    module Transform
+    # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+    
+      # LogAutoScaler for GSL vectors.
+      # Take log and scale.
+      class LogAutoScale
+        attr_accessor :vs, :offset, :autoscaler
+
+        # @param [GSL::Vector] Values to transform using LogAutoScaling.
+        def initialize values
+          @distance_to_zero = 1.0
+          begin
+            raise "Cannot transform, values empty." if values.size==0
+            vs = values.clone
+            @offset = vs.min - @distance_to_zero
+            @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+            @vs = @autoscaler.vs
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # @param [GSL::Vector] values to restore.
+        # @return [GSL::Vector] transformed values.
+        def restore values
+          begin
+            raise "Cannot transform, values empty." if values.size==0
+            vs = values.clone
+            rv = @autoscaler.restore(vs)
+            rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # @param [GSL::Vector] values to transform.
+        # @return [GSL::Vector] transformed values.
+        def mvlog values 
+          values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+        end
+
+      end
+
+
+      # Auto-Scaler for GSL vectors.
+      # Center on mean and divide by standard deviation.
+      class AutoScale 
+        attr_accessor :vs, :mean, :stdev
+
+        # @param [GSL::Vector] values to transform using AutoScaling.
+        def initialize values
+          begin
+            raise "Cannot transform, values empty." if values.size==0
+            vs = values.clone
+            @mean = vs.to_scale.mean
+            @stdev = vs.to_scale.standard_deviation_population
+            @stdev = 0.0 if @stdev.nan? 
+            @vs = transform vs
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # @param [GSL::Vector] values to transform.
+        # @return [GSL::Vector] transformed values.
+        def transform values
+          begin
+            raise "Cannot transform, values empty." if values.size==0
+            autoscale values.clone
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # @param [GSL::Vector] Values to restore.
+        # @return [GSL::Vector] transformed values.
+        def restore values
+          begin
+            raise "Cannot transform, values empty." if values.size==0
+            rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+            (rv_ss + @mean).to_gsl
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # @param [GSL::Vector] values to transform.
+        # @return [GSL::Vector] transformed values.
+        def autoscale values
+          vs_ss = values.clone.to_scale - @mean
+          @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+        end
+
+      end
+
+
+      # Principal Components Analysis.
+      class PCA
+        attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+        # Creates a transformed dataset as GSL::Matrix.
+        #
+        # @param [GSL::Matrix] Data matrix.
+        # @param [Float] Compression ratio from [0,1], default 0.05.
+        # @return [GSL::Matrix] Data transformed matrix.
+        def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+          begin
+            @data_matrix = data_matrix.clone
+            @compression = compression.to_f
+            @mean = Array.new
+            @autoscaler = Array.new
+            @cols = Array.new
+            @maxcols = maxcols
+
+            # Objective Feature Selection
+            raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+            @data_matrix_selected = nil
+            (0..@data_matrix.size2-1).each { |i|
+              if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+                if @data_matrix_selected.nil?
+                  @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1) 
+                  @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+                else
+                  @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+                end
+                @cols << i
+              end             
+            }
+            raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+            # PCA uses internal centering on 0
+            @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+            (0..@cols.size-1).each { |i|
+              as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+              @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+              @mean << as.mean
+              @autoscaler << as
+            }
+
+            # PCA
+            data_matrix_hash = Hash.new
+            (0..@cols.size-1).each { |i|
+              column_view = @data_matrix_scaled.col(i)
+              data_matrix_hash[i] = column_view.to_scale
+            }
+            dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+            cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+            pca=Statsample::Factor::PCA.new(cor_matrix)
+
+            # Select best eigenvectors
+            pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+            @eigenvalue_sums = Array.new
+            (0..@cols.size-1).each { |i|
+              @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+            }
+            eigenvectors_selected = Array.new
+            pca.eigenvectors.each_with_index { |ev, i|
+              if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+                eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+              end
+            }
+            @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+            @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+          rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # Transforms data to feature space found by PCA.
+        #
+        # @param [GSL::Matrix] Data matrix.
+        # @return [GSL::Matrix] Transformed data matrix.
+        def transform values
+          begin
+            vs = values.clone
+            raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+            data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+            @cols.each_with_index { |i,j|
+              data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+            }
+            (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # Restores data in the original feature space (possibly with compression loss).
+        #
+        # @param [GSL::Matrix] Transformed data matrix.
+        # @return [GSL::Matrix] Data matrix.
+        def restore
+          begin 
+            data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+            # reverse scaling
+            (0..@cols.size-1).each { |i|
+              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+            }
+            data_matrix_restored
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+      end
+
+
+      # Singular Value Decomposition
+      class SVD
+        attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+        # Creates a transformed dataset as GSL::Matrix.
+        #
+        # @param [GSL::Matrix] Data matrix
+        # @param [Float] Compression ratio from [0,1], default 0.05
+        # @return [GSL::Matrix] Data transformed matrix
+
+        def initialize data_matrix, compression=0.05
+          begin
+            @data_matrix = data_matrix.clone
+            @compression = compression
+
+            # Compute the SV Decomposition X=USV
+            # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+            u, vt, s = data_matrix.SV_decomp 
+            
+            # Determine cutoff index
+            s2 = s.mul(s) ; s2_sum = s2.sum
+            s2_run = 0
+            k = s2.size - 1
+            s2.to_a.reverse.each { |v| 
+              s2_run += v
+              frac = s2_run / s2_sum
+              break if frac > compression
+              k -= 1
+            }
+            k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+            
+            # Take the k-rank approximation of the Matrix
+            #   - Take first k columns of u
+            #   - Take first k columns of vt
+            #   - Take the first k eigenvalues
+            @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+            @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+            s = GSL::Matrix.diagonal(s)
+            @eigk = s.submatrix((0..k), (0..k))
+            @eigk_inv = @eigk.inv
+
+            # Transform data
+            @data_transformed_matrix = @uk # = u for all SVs
+            # NOTE: @data_transformed_matrix is also equal to
+            # @data_matrix * @vk * @eigk_inv
+
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+
+        # Transforms data instance (1 row) to feature space found by SVD.
+        #
+        # @param [GSL::Matrix] Data matrix (1 x m).
+        # @return [GSL::Matrix] Transformed data matrix.
+        def transform_instance values
+          begin
+            values * @vk * @eigk_inv
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+        alias :transform :transform_instance # make this the default (see PCA interface)
+
+        # Transforms data feature (1 column) to feature space found by SVD.
+        #
+        # @param [GSL::Matrix] Data matrix (1 x n).
+        # @return [GSL::Matrix] Transformed data matrix.
+        def transform_feature values
+          begin
+            values * @uk * @eigk_inv
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+
+        # Restores data in the original feature space (possibly with compression loss).
+        #
+        # @param [GSL::Matrix] Transformed data matrix.
+        # @return [GSL::Matrix] Data matrix.
+        def restore
+          begin 
+            @data_transformed_matrix * @eigk * @vk.transpose  # reverse svd
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+
+      end
+
+
+
+      # Attaches transformations to an OpenTox::Model
+      # Stores props, sims, performs similarity calculations
+      class ModelTransformer
+        attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+        # @params[OpenTox::Model] model to transform
+        def initialize model
+          @model = model
+          @similarity_algorithm = @model.similarity_algorithm
+        end
+
+        def transform
+          get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+          @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+          # Preprocessing
+          if (@model.similarity_algorithm == "Similarity.cosine")
+            # truncate nil-columns and -rows
+            LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+            while @q_prop.size>0
+              idx = @q_prop.index(nil)
+              break if idx.nil?
+              @q_prop.slice!(idx)
+              @n_prop.each { |r| r.slice!(idx) }
+            end
+            LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+            remove_nils  # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
+            LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+            # adjust rest
+            fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+            cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+            acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+            # scale and svd
+            nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+            gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+            gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+            (0...nr_features).each { |i|
+               autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+               gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+               gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+            }
+            svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+            @n_prop = svd.data_transformed_matrix.to_a
+            @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+            LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+          else
+            convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
+          end
+
+          # neighbor calculation
+          @ids = [] # surviving compounds become neighbors
+          @sims = [] # calculated by neighbor routine
+          neighbors
+          n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+          acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+          # Sims between neighbors, if necessary
+          gram_matrix = []
+          if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+            @n_prop.each_index do |i|
+              gram_matrix[i] = [] unless gram_matrix[i]
+              @n_prop.each_index do |j|
+                if (j>i)
+                  sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+                  gram_matrix[i][j] = sim
+                  gram_matrix[j] = [] unless gram_matrix[j]
+                  gram_matrix[j][i] = gram_matrix[i][j]
+                end
+              end
+              gram_matrix[i][i] = 1.0
+            end
+          end
+
+          # reclaim original data (if svd was performed)
+          if svd
+            @n_prop = gsl_n_prop_orig.to_a 
+            n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+            @q_prop = gsl_q_prop_orig.row(0).to_a
+          end
+
+          LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+          LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+          @sims = [ gram_matrix, @sims ] 
+
+        end
+
+
+          
+
+        # Find neighbors and store them as object variable, access all compounds for that.
+        def neighbors
+          @model.neighbors = []
+          @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+            add_neighbor fp, idx
+          end
+        end
+
+
+        # Adds a neighbor to @neighbors if it passes the similarity threshold
+        # adjusts @ids to signal the
+        def add_neighbor(training_props, idx)
+
+          sim = similarity(training_props)
+          if sim > @model.parameter("min_sim")
+            if @model.activities[@cmpds[idx]]
+              @model.activities[@cmpds[idx]].each do |act|
+                @model.neighbors << {
+                  :compound => @cmpds[idx],
+                  :similarity => sim,
+                  :features => @fps[idx].keys,
+                  :activity => act
+                }
+                @sims << sim
+                @ids << idx
+              end
+            end
+          end
+        end
+
+
+        # Removes nil entries from n_prop and q_prop.
+        # Matrix is a nested two-dimensional array.
+        # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
+        # Tie break: columns take precedence.
+        # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+        # Enables the use of cosine similarity / SVD
+        def remove_nils
+         return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+          col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+          row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+          m_cols = col_nr_nils.max
+          m_rows = row_nr_nils.max
+          idx_cols = col_nr_nils.index(m_cols)
+          idx_rows = row_nr_nils.index(m_rows)
+          while ((m_cols > 0) || (m_rows > 0)) do
+            if m_cols >= m_rows
+              @n_prop.each { |row| row.slice!(idx_cols) }
+              @q_prop.slice!(idx_cols)
+            else
+              @n_prop.slice!(idx_rows)
+              @ids.slice!(idx_rows)
+            end
+            break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+            col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+            row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+            m_cols = col_nr_nils.max
+            m_rows = row_nr_nils.max
+            idx_cols= col_nr_nils.index(m_cols)
+            idx_rows = row_nr_nils.index(m_rows)
+          end
+        end
+
+
+        # Replaces nils by zeroes in n_prop and q_prop
+        # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
+        def convert_nils
+          @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+          @q_prop.collect! { |v| v.nil? ? 0 : v }
+        end
+
+
+        # Executes model similarity_algorithm
+        def similarity(training_props)
+          eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+        end
+
+
+        # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
+        # Same for compound fingerprints.
+        def get_matrices
+
+          @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+          
+          @model.fingerprints.each { |fp|
+            cmpd = fp[0]; fp = fp[1]
+            if @model.activities[cmpd] # row good
+              acts = @model.activities[cmpd]; @acts += acts
+              LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+              row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+              acts.size.times { # multiple additions for multiple activities
+                @n_prop << row.collect
+                @cmpds << cmpd
+                @fps << Marshal.load(Marshal.dump(fp))
+              } 
+            else
+              LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+            end
+          }
+
+          @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+        end
+
+        def props
+          @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+        end
+
+      end
+
+    end
+end
diff --git a/lib/utils.rb b/lib/utils.rb
new file mode 100644
index 0000000..c9e9eca
--- /dev/null
+++ b/lib/utils.rb
@@ -0,0 +1,372 @@
+require 'csv'
+
+
+module OpenTox
+
+  module Algorithm
+
+    include OpenTox
+
+    # Calculate physico-chemical descriptors.
+    # @param[Hash] Required keys: :dataset_uri, :pc_type
+    # @return[String] dataset uri
+
+    def self.pc_descriptors(params)
+
+      begin
+        ds = OpenTox::Dataset.find(params[:dataset_uri])
+        compounds = ds.compounds.collect
+        ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
+        #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
+        LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+        load_ds_csv(ambit_result_uri, smiles_to_inchi)
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      end
+
+    end
+    
+    # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+    # @param[Hash] Required keys: :compounds, :pc_type
+    # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
+    def self.get_pc_descriptors(params)
+
+      begin
+
+        ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+        ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+        descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
+        descs_uris = []
+        params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
+        types = params[:pc_type].split(",")
+        descs.each { |uri, cat_name| 
+          if types.include? cat_name[:category]
+            descs_uris << uri
+          end
+        }
+        if descs_uris.size == 0
+          raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
+        end
+        #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
+
+        begin
+          # Create SMI
+          smiles_array = []; smiles_to_inchi = {}
+          params[:compounds].each do |n|
+            cmpd = OpenTox::Compound.new(n)
+            smiles_string = cmpd.to_smiles
+            smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
+            smiles_array << smiles_string
+          end
+          smi_file = Tempfile.open(['pc_ambit', '.csv'])
+          pc_descriptors = nil
+
+          # Create Ambit dataset
+          smi_file.puts( "SMILES\n" )
+          smi_file.puts( smiles_array.join("\n") )
+          smi_file.flush
+          ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        ensure
+          smi_file.close! if smi_file
+        end
+        ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
+
+        # Calculate 3D for CPSA
+        if types.include? "cpsa"
+          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} ) 
+          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+        end
+
+        # Get Ambit results
+        ambit_result_uri = [] # 1st pos: base uri, then features
+        ambit_result_uri << ambit_ds_uri + "?"
+        ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+        descs_uris.each_with_index do |uri, i|
+          algorithm = Algorithm::Generic.new(uri)
+          result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
+          ambit_result_uri << result_uri.split("?")[1] + "&"
+          LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+        end
+        #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
+        [ ambit_result_uri, smiles_to_inchi ]
+
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      end
+    end
+
+
+    # Load dataset via CSV
+    # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
+    # @return[String] dataset uri 
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi)
+      
+      master=nil
+      (1...ambit_result_uri.size).collect { |idx|
+        curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+        LOGGER.debug "Requesting #{curr_uri}"
+        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) )
+        if csv_data[0] && csv_data[0].size>1
+          if master.nil? # This is the smiles entry
+            (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+            master = csv_data
+            next
+          else
+            index_uri = csv_data[0].index("SMILES")
+            csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+            
+            nr_cols = (csv_data[0].size)-1
+            LOGGER.debug "Merging #{nr_cols} new columns"
+            master.each {|row| nr_cols.times { row.push(nil) }  } # Adds empty columns to all rows
+            csv_data.each do |row|
+              temp = master.assoc(row[0]) # Finds the appropriate line in master
+              ((-1*nr_cols)..-1).collect.each { |idx|
+                temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+              }
+            end
+          end
+        end
+      }
+
+      index_uri = master[0].index("Compound")
+      master.map {|i| i.delete_at(index_uri)}
+      master[0].each {|cell| cell.chomp!(" ")}
+      master[0][0] = "Compound" #"SMILES" 
+      index_smi = master[0].index("SMILES")
+      master.map {|i| i.delete_at(index_smi)} if index_smi
+      #master[0][0] = "SMILES" 
+       
+      #LOGGER.debug "-------- AM: Writing to dumpfile"
+      #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
+     
+      parser = OpenTox::Parser::Spreadsheets.new
+      ds = OpenTox::Dataset.new
+      ds.save
+      parser.dataset = ds
+      ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
+      ds.save
+    end
+
+
+    # Gauss kernel
+    # @return [Float] 
+    def self.gauss(x, sigma = 0.3) 
+      d = 1.0 - x.to_f
+      Math.exp(-(d*d)/(2*sigma*sigma))
+    end
+
+
+    # For symbolic features
+    # @param [Array] Array to test, must indicate non-occurrence with 0.
+    # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+    def self.isnull_or_singular?(array)
+      nr_zeroes = array.count(0)
+      return (nr_zeroes == array.size) ||    # remove non-occurring feature
+             (nr_zeroes == array.size-1) ||  # remove singular feature
+             (nr_zeroes == 0)                # also remove feature present everywhere
+    end
+
+
+    # Numeric value test
+    # @param[Object] value
+    # @return [Boolean] Whether value is a number
+    def self.numeric?(value)
+      true if Float(value) rescue false
+    end
+
+
+    # For symbolic features
+    # @param [Array] Array to test, must indicate non-occurrence with 0.
+    # @return [Boolean] Whether the feature has variance zero.
+    def self.zero_variance?(array)
+      return array.uniq.size == 1
+    end
+    
+
+    # Sum of an array for Arrays.
+    # @param [Array] Array with values
+    # @return [Integer] Sum of size of values
+    def self.sum_size(array)
+      sum=0
+      array.each { |e| sum += e.size }
+      return sum
+    end
+
+
+    # Minimum Frequency
+    # @param [Integer] per-mil value
+    # return [Integer] min-frequency
+    def self.min_frequency(training_dataset,per_mil)
+      minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+      minfreq = 2 unless minfreq > 2
+      Integer (minfreq)
+    end
+
+
+    # Effect calculation for classification
+    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # @param [Array] Array of database instance counts per class.
+    def self.effect(occurrences, db_instances)
+      max=0
+      max_value=0
+      nr_o = self.sum_size(occurrences)
+      nr_db = db_instances.to_scale.sum
+
+      occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
+        actual = o.size.to_f/nr_o
+        expected = db_instances[i].to_f/nr_db
+        if actual > expected
+          if ((actual - expected) / actual) > max_value
+           max_value = (actual - expected) / actual # 'Schleppzeiger'
+            max = i
+          end
+        end
+      }
+      max
+    end
+    
+
+    # neighbors
+
+    module Neighbors
+      
+      # Get confidence.
+      # @param[Hash] Required keys: :sims, :acts
+      # @return[Float] Confidence
+      def self.get_confidence(params)
+        conf = params[:sims].inject{|sum,x| sum + x }
+        confidence = conf/params[:sims].size
+        LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+        return confidence
+      end
+
+    end
+
+
+    # Similarity calculations
+    module Similarity
+
+      # Tanimoto similarity
+      # @param [Hash, Array] fingerprints of first compound
+      # @param [Hash, Array] fingerprints of second compound
+      # @return [Float] (Weighted) tanimoto similarity
+      def self.tanimoto(fingerprints_a,fingerprints_b,weights=nil,params=nil)
+
+        common_p_sum = 0.0
+        all_p_sum = 0.0
+
+        # fingerprints are hashes
+        if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+          common_features = fingerprints_a.keys & fingerprints_b.keys
+          all_features = (fingerprints_a.keys + fingerprints_b.keys).uniq
+          if common_features.size > 0
+            common_features.each{ |f| common_p_sum += [ fingerprints_a[f], fingerprints_b[f] ].min }
+            all_features.each{ |f| all_p_sum += [ fingerprints_a[f],fingerprints_b[f] ].compact.max } # compact, since one fp may be empty at that pos
+          end
+
+        # fingerprints are arrays
+        elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+          size = [ fingerprints_a.size, fingerprints_b.size ].min
+          LOGGER.warn "fingerprints don't have equal size" if fingerprints_a.size != fingerprints_b.size
+          (0...size).each { |idx|
+            common_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].min
+            all_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].max
+          }
+        end
+
+        (all_p_sum > 0.0) ? (common_p_sum/all_p_sum) : 0.0
+
+      end
+
+
+      # Cosine similarity
+      # @param [Hash] properties_a key-value properties of first compound
+      # @param [Hash] properties_b key-value properties of second compound
+      # @return [Float] cosine of angle enclosed between vectors induced by keys present in both a and b
+      def self.cosine(fingerprints_a,fingerprints_b,weights=nil)
+
+        # fingerprints are hashes
+        if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+          a = []; b = []
+          common_features = fingerprints_a.keys & fingerprints_b.keys
+          if common_features.size > 1
+            common_features.each do |p|
+              a << fingerprints_a[p]
+              b << fingerprints_b[p]
+            end
+          end
+
+        # fingerprints are arrays
+        elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+          a = fingerprints_a
+          b = fingerprints_b
+        end
+
+        (a.size > 0 && b.size > 0) ? self.cosine_num(a.to_gv, b.to_gv) : 0.0
+
+      end
+
+
+      # Cosine similarity
+      # @param [GSL::Vector] a
+      # @param [GSL::Vector] b
+      # @return [Float] cosine of angle enclosed between a and b
+      def self.cosine_num(a, b)
+        if a.size>12 && b.size>12
+          a = a[0..11]
+          b = b[0..11]
+        end
+        a.dot(b) / (a.norm * b.norm)
+      end
+
+
+      # Outlier detection based on Mahalanobis distances
+      # Multivariate detection on X, univariate detection on y
+      # Uses an existing Rinruby instance, if possible
+      # @param[Hash] Keys query_matrix, data_matrix, acts are required; r, p_outlier optional
+      # @return[Array] indices identifying outliers (may occur several times, this is intended)
+      def self.outliers(params)
+        outlier_array = []
+        data_matrix = params[:data_matrix]
+        query_matrix = params[:query_matrix]
+        acts = params[:acts]
+        begin
+          LOGGER.debug "Outliers (p=#{params[:p_outlier] || 0.9999})..."
+          r = ( params[:r] || RinRuby.new(false,false) )
+          r.eval "suppressPackageStartupMessages(library(\"robustbase\"))"
+          r.eval "outlier_threshold = #{params[:p_outlier] || 0.999}"
+          nr_cases, nr_features = data_matrix.to_a.size, data_matrix.to_a[0].size
+          r.odx = data_matrix.to_a.flatten
+          r.q = query_matrix.to_a.flatten
+          r.y = acts.to_a.flatten
+          r.eval "odx = matrix(odx, #{nr_cases}, #{nr_features}, byrow=T)"
+          r.eval 'odx = rbind(q,odx)' # query is nr 0 (1) in ruby (R)
+          r.eval 'mah = covMcd(odx)$mah' # run MCD alg
+          r.eval "mah = pchisq(mah,#{nr_features})"
+          r.eval 'outlier_array = which(mah>outlier_threshold)'  # multivariate outliers using robust mahalanobis
+          outlier_array = r.outlier_array.to_a.collect{|v| v-2 }  # translate to ruby index (-1 for q, -1 due to ruby)
+          r.eval 'fqu = matrix(summary(y))[2]'
+          r.eval 'tqu = matrix(summary(y))[5]'
+          r.eval 'outlier_array = which(y>(tqu+1.5*IQR(y)))'     # univariate outliers due to Tukey (http://goo.gl/mwzNH)
+          outlier_array += r.outlier_array.to_a.collect{|v| v-1 } # translate to ruby index (-1 due to ruby)
+          r.eval 'outlier_array = which(y<(fqu-1.5*IQR(y)))'
+          outlier_array += r.outlier_array.to_a.collect{|v| v-1 }
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+        outlier_array
+      end
+
+    end
+
+
+  end
+
+end
+
diff --git a/opentox-ruby.gemspec b/opentox-ruby.gemspec
index d316689..d3ae2d7 100644
--- a/opentox-ruby.gemspec
+++ b/opentox-ruby.gemspec
@@ -48,6 +48,8 @@ Gem::Specification.new do |s|
     "lib/templates/default_guest_policy.xml",
     "lib/templates/default_policy.xml",
     "lib/to-html.rb",
+    "lib/transform.rb",
+    "lib/utils.rb"
     "lib/validation.rb"
   ]
   s.homepage = %q{http://github.com/helma/opentox-ruby}
@@ -88,6 +90,7 @@ Gem::Specification.new do |s|
       s.add_runtime_dependency(%q<dm-sqlite-adapter>, [">= 0"])
       s.add_runtime_dependency(%q<haml>, [">= 3"])
       s.add_runtime_dependency(%q<ruby-plot>, ["~> 0.4.0"])
+      s.add_runtime_dependency(%q<statsample>, [">= 0"])
       s.add_development_dependency(%q<jeweler>, [">= 0"])
     else
       s.add_dependency(%q<sinatra>, [">= 0"])
@@ -119,6 +122,7 @@ Gem::Specification.new do |s|
       s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"])
       s.add_dependency(%q<haml>, [">= 3"])
       s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"])
+      s.add_dependency(%q<statsample>, [">= 0"])
       s.add_dependency(%q<jeweler>, [">= 0"])
     end
   else
@@ -151,6 +155,7 @@ Gem::Specification.new do |s|
     s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"])
     s.add_dependency(%q<haml>, [">= 3"])
     s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"])
+    s.add_dependency(%q<statsample>, [">= 0"])
     s.add_dependency(%q<jeweler>, [">= 0"])
   end
 end
author	mguetlein <martin.guetlein@gmail.com>	2012-02-28 10:34:58 +0100
committer	mguetlein <martin.guetlein@gmail.com>	2012-02-28 10:34:58 +0100
commit	32cd79cd055ccd0aa9de29d68ca8c3e63663b126 (patch)
tree	5058c65d249e39983eac1d2ae0641d069f68937e
parent	04e8d118ec8e4d09b4e3ad6538aa7e8974613076 (diff)
parent	754fd31f83be3ac7557a2f7832ff44fb262a98a5 (diff)