author     rautenberg <rautenberg@in-silico.ch>  2012-03-13 15:32:57 +0100
committer  rautenberg <rautenberg@in-silico.ch>  2012-03-13 15:32:57 +0100
commit     6b064515f11623e0209f265b32be6889e28def52 (patch)
tree       66e9182b001a94c96270c153a659b6b8eb0055c2
parent     1687a218b1593478bae1ab43a3eb8e5596def684 (diff)
parent     4f14262609d58bf856675ae01195dd2c5f70b97b (diff)
pre v3.1.0
-rw-r--r--  ChangeLog                     8
-rw-r--r--  Rakefile                      5
-rw-r--r--  lib/algorithm.rb            902
-rw-r--r--  lib/authorization.rb         18
-rw-r--r--  lib/compound.rb              58
-rw-r--r--  lib/dataset.rb               52
-rw-r--r--  lib/environment.rb            7
-rw-r--r--  lib/model.rb                109
-rw-r--r--  lib/opentox-ruby.rb           2
-rw-r--r--  lib/parser.rb               172
-rw-r--r--  lib/r-util.rb               354
-rw-r--r--  lib/rest_client_wrapper.rb    2
-rw-r--r--  lib/serializer.rb            77
-rw-r--r--  lib/stratification.R        201
-rw-r--r--  lib/task.rb                   6
-rw-r--r--  lib/transform.rb            520
-rw-r--r--  lib/utils.rb                372
-rw-r--r--  lib/validation.rb            58
-rw-r--r--  opentox-ruby.gemspec          4
19 files changed, 2058 insertions, 869 deletions
diff --git a/ChangeLog b/ChangeLog
index de9e01b..5872d56 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+v3.1.0 2012-02-24
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV upload and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cosine similarity as algorithm,
+ gauss() removed
+
v3.0.1 2011-10-19
* feature: model registration to ontology service
* ontology lib gets endpoints from ontology service
diff --git a/Rakefile b/Rakefile
index 952affe..dddea1b 100644
--- a/Rakefile
+++ b/Rakefile
@@ -16,7 +16,7 @@ begin
gem.add_dependency "sinatra-respond_to", "=0.7.0"
gem.add_dependency "sinatra-static-assets", "=0.5.0"
gem.add_dependency "rest-client", "=1.6.1"
- gem.add_dependency "rack", "=1.3.1"
+ gem.add_dependency "rack", "=1.3.5"
gem.add_dependency "rack-contrib", "=1.1.0"
gem.add_dependency "rack-flash", "=0.1.1"
gem.add_dependency "nokogiri", "=1.4.4"
@@ -42,10 +42,9 @@ begin
gem.add_dependency "dm-migrations", "=1.1.0"
gem.add_dependency "dm-validations", "=1.1.0"
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.5.0"
+ gem.add_dependency "ruby-plot", "=0.6.0"
gem.add_dependency "gsl", "=1.14.7"
gem.add_dependency "statsample", "=1.1.0"
- #gem.add_dependency "statsample-optimization", "=2.1.0"
gem.add_development_dependency 'jeweler'
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index cf88bab..c026c56 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -5,6 +5,8 @@ R = nil
require "rinruby"
require "statsample"
require 'uri'
+require 'transform.rb'
+require 'utils.rb'
module OpenTox
@@ -13,7 +15,7 @@ module OpenTox
include OpenTox
- # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters
+ # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
# @param [optional,Hash] params Algorithm parameters
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [String] URI of new resource (dataset, model, ...)
@@ -21,7 +23,7 @@ module OpenTox
LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
end
-
+
# Get OWL-DL representation in RDF/XML format
# @return [application/rdf+xml] RDF/XML representation
def to_rdfxml
@@ -33,7 +35,7 @@ module OpenTox
# Generic Algorithm class, should work with all OpenTox webservices
class Generic
include Algorithm
-
+
# Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
# @param [String] uri Algorithm URI
# @return [OpenTox::Algorithm::Generic] Algorithm instance
@@ -44,14 +46,14 @@ module OpenTox
raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
alg
end
-
+
end
# Fminer algorithms (https://github.com/amaunz/fminer2)
class Fminer
include Algorithm
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
-
+
def check_params(params,per_mil,subjectid=nil)
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
@@ -81,7 +83,7 @@ module OpenTox
LOGGER.warn "Cannot find smiles for #{compound.to_s}."
next
end
-
+
value_map=params[:value_map] unless params[:value_map].nil?
entry.each do |feature,values|
if feature == @prediction_feature.uri
@@ -90,7 +92,7 @@ module OpenTox
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
+ activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
activity= value.to_f
@@ -115,23 +117,23 @@ module OpenTox
end
- # Backbone Refinement Class mining (http://bbrc.maunz.de/)
- class BBRC < Fminer
- # Initialize bbrc algorithm
- def initialize(subjectid=nil)
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
- load_metadata(subjectid)
- end
+ # Backbone Refinement Class mining (http://bbrc.maunz.de/)
+ class BBRC < Fminer
+ # Initialize bbrc algorithm
+ def initialize(subjectid=nil)
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
+ load_metadata(subjectid)
end
+ end
- # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
- class LAST < Fminer
- # Initialize last algorithm
- def initialize(subjectid=nil)
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
- load_metadata(subjectid)
- end
+ # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
+ class LAST < Fminer
+ # Initialize last algorithm
+ def initialize(subjectid=nil)
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
+ load_metadata(subjectid)
end
+ end
# Create lazar prediction model
@@ -144,72 +146,6 @@ module OpenTox
end
end
- # Utility methods without dedicated webservices
-
- # Similarity calculations
- module Similarity
- include Algorithm
-
- # Tanimoto similarity
- # @param [Array] features_a Features of first compound
- # @param [Array] features_b Features of second compound
- # @param [optional, Hash] weights Weights for all features
- # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
- # @return [Float] (Weighted) tanimoto similarity
- def self.tanimoto(features_a,features_b,weights=nil,params=nil)
- common_features = features_a & features_b
- all_features = (features_a + features_b).uniq
- #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
- if common_features.size > 0
- if weights
- #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
- if !params.nil? && params[:nr_hits]
- params[:weights] = weights
- params[:mode] = "min"
- params[:features] = common_features
- common_p_sum = Algorithm.p_sum_support(params)
- params[:mode] = "max"
- params[:features] = all_features
- all_p_sum = Algorithm.p_sum_support(params)
- else
- common_p_sum = 0.0
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
- all_p_sum = 0.0
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
- end
- #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
- common_p_sum/all_p_sum
- else
- #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
- common_features.size.to_f/all_features.size.to_f
- end
- else
- 0.0
- end
- end
-
- # Euclidean similarity
- # @param [Hash] properties_a Properties of first compound
- # @param [Hash] properties_b Properties of second compound
- # @param [optional, Hash] weights Weights for all properties
- # @return [Float] (Weighted) euclidean similarity
- def self.euclidean(properties_a,properties_b,weights=nil)
- common_properties = properties_a.keys & properties_b.keys
- if common_properties.size > 1
- dist_sum = 0
- common_properties.each do |p|
- if weights
- dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
- else
- dist_sum += (properties_a[p] - properties_b[p])**2
- end
- end
- 1/(1+Math.sqrt(dist_sum))
- else
- 0.0
- end
- end
- end
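For reference, the unweighted branch of the Tanimoto similarity computed by the removed module reduces to the ratio of shared to combined features. A minimal standalone sketch (illustrative, not the library API):

# Hedged sketch of unweighted Tanimoto: |A & B| / |A u B| over feature arrays.
def tanimoto(features_a, features_b)
  common = features_a & features_b
  all    = (features_a + features_b).uniq
  common.empty? ? 0.0 : common.size.to_f / all.size
end

tanimoto(%w[cc cN], %w[cc cO])  # => 0.333... (1 shared of 3 total features)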
# Structural Graph Clustering by TU Munich
# Finds clusters similar to a query structure in a given training dataset
@@ -226,7 +162,7 @@ module OpenTox
raise "Invalid URI."
end
@training_dataset_uri = training_dataset_uri
- if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+ if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
raise "Training threshold out of bounds."
end
@training_threshold = training_threshold.to_f
@@ -259,7 +195,7 @@ module OpenTox
# @params[Float] Similarity threshold for query to clusters (optional)
def get_clusters query_compound_uri, query_threshold = 0.5
- if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+ if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
raise "Query threshold out of bounds."
end
@query_threshold = query_threshold.to_f
@@ -285,7 +221,7 @@ module OpenTox
metadata[DC.title][pattern]=""
feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
}
-
+
# Integrity check
unless cluster_query_dataset.compounds.size == 1
raise "Number of predicted compounds is != 1."
@@ -295,11 +231,11 @@ module OpenTox
query_compound_uri = cluster_query_dataset.compounds[0]
@target_clusters_array = Array.new
cluster_query_dataset.features.keys.each { |cluster_membership_feature|
-
+
# Getting dataset URI for cluster
target_cluster = feature_clusterid_map[cluster_membership_feature]
dataset = @clusterid_dataset_map[target_cluster]
-
+
# Finally look up presence
data_entry = cluster_query_dataset.data_entries[query_compound_uri]
present = data_entry[cluster_membership_feature][0]
@@ -311,85 +247,13 @@ module OpenTox
end
- module Neighbors
-
- # Local multi-linear regression (MLR) prediction from neighbors.
- # Uses propositionalized setting.
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
- # @return [Numeric] A prediction value.
- def self.local_mlr_prop(params)
-
- confidence=0.0
- prediction=nil
-
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
- sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
- LOGGER.debug "Local MLR (Propositionalization / GSL)."
- prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
- prediction = transformer.values[0]
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
- confidence = nil if prediction.nil?
- end
- {:prediction => prediction, :confidence => confidence}
-
- end
-
- # Multi-linear regression weighted by similarity.
- # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
- # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
- # @return [Numeric] A prediction value.
- def self.mlr(params)
-
- # GSL matrix operations:
- # to_a : row-wise conversion to nested array
- #
- # Statsample operations (build on GSL):
- # to_scale: convert into Statsample format
-
- begin
- n_prop = params[:n_prop].collect { |v| v }
- q_prop = params[:q_prop].collect { |v| v }
- n_prop << q_prop # attach q_prop
- nr_cases, nr_features = get_sizes n_prop
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
-
- # Principal Components Analysis
- LOGGER.debug "PCA..."
- pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
- data_matrix = pca.data_transformed_matrix
-
- # Attach intercept column to data
- intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
- data_matrix = data_matrix.horzcat(intercept)
- (0..data_matrix.size2-2).each { |i|
- autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
- data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
- }
- # Detach query instance
- n_prop = data_matrix.to_a
- q_prop = n_prop.pop
- nr_cases, nr_features = get_sizes n_prop
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
- # model + support vectors
- LOGGER.debug "Creating MLR model ..."
- c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
- GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- end
+ module Neighbors
- end
# Classification with majority vote from neighbors weighted by similarity
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Hash] params Keys `:acts, :sims, :value_map` are required
# @return [Numeric] A prediction value.
def self.weighted_majority_vote(params)
@@ -398,12 +262,13 @@ module OpenTox
confidence = 0.0
prediction = nil
- params[:neighbors].each do |neighbor|
- neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
- neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+ LOGGER.debug "Weighted Majority Vote Classification."
+ params[:acts].each_index do |idx|
+ neighbor_weight = params[:sims][1][idx]
+ neighbor_contribution += params[:acts][idx] * neighbor_weight
if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
- case neighbor[:activity]
+ case params[:acts][idx]
when 1
confidence_sum -= neighbor_weight
when 2
@@ -413,294 +278,257 @@ module OpenTox
confidence_sum += neighbor_weight
end
end
-
if params[:value_map].size == 2
if confidence_sum >= 0.0
- prediction = 2 unless params[:neighbors].size==0
+ prediction = 2 unless params[:acts].size==0
elsif confidence_sum < 0.0
- prediction = 1 unless params[:neighbors].size==0
+ prediction = 1 unless params[:acts].size==0
end
else
- prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
end
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
- confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+ confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
return {:prediction => prediction, :confidence => confidence.abs}
end
+
+
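The reworked vote consumes plain arrays (:acts holding integer classes 1..n, :sims holding the weight vector at index 1) instead of neighbor hashes. A minimal standalone sketch of the same weighting scheme, with hypothetical names:

# Hedged sketch: similarity-weighted majority vote over classes 1..n.
# Illustration only, not the service entry point above.
def majority_vote(acts, sims)
  return nil if acts.empty?
  contribution = 0.0
  weight_sum   = 0.0
  acts.each_index do |i|
    contribution += acts[i] * sims[i]
    weight_sum   += sims[i]
  end
  (contribution / weight_sum).round
end

majority_vote([1, 2, 2], [0.9, 0.8, 0.7])  # => 2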
# Local support vector regression from neighbors
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
# @return [Numeric] A prediction value.
def self.local_svm_regression(params)
- confidence = 0.0
- prediction = nil
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect{ |n| n[:activity].to_f }
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
- prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
- prediction = transformer.values[0]
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
- confidence = nil if prediction.nil?
+ begin
+ confidence = 0.0
+ prediction = nil
+
+ LOGGER.debug "Local SVM."
+ if params[:acts].size>0
+ if params[:props]
+ n_prop = params[:props][0].collect
+ q_prop = params[:props][1].collect
+ props = [ n_prop, q_prop ]
+ end
+ acts = params[:acts].collect
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+ confidence = 0.0 if prediction.nil?
+ end
+ {:prediction => prediction, :confidence => confidence}
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- {:prediction => prediction, :confidence => confidence}
-
+
end
- # Local support vector classification from neighbors
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+
+ # Local support vector classification from neighbors
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
# @return [Numeric] A prediction value.
def self.local_svm_classification(params)
- confidence = 0.0
- prediction = nil
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect { |n| act = n[:activity] }
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ begin
+ confidence = 0.0
+ prediction = nil
+
+ LOGGER.debug "Local SVM."
+ if params[:acts].size>0
+ if params[:props]
+ n_prop = params[:props][0].collect
+ q_prop = params[:props][1].collect
+ props = [ n_prop, q_prop ]
+ end
+ acts = params[:acts].collect
+ acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
+ confidence = 0.0 if prediction.nil?
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+ end
+ {:prediction => prediction, :confidence => confidence}
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- {:prediction => prediction, :confidence => confidence}
-
+
end
+
# Local support vector prediction from neighbors.
- # Uses pre-defined Kernel Matrix.
+ # Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ two-nested-Arrays_for_n, Array_for_q ]
# @param [Array] acts, activities for neighbors.
- # @param [Array] sims, similarities for neighbors.
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Float] min_train_performance, parameter to control censoring
# @return [Numeric] A prediction value.
- def self.local_svm(acts, sims, type, params)
- LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
- neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
- gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
+ def self.local_svm_prop(props, acts, min_train_performance)
+
+ LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+ n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+ q_prop = props[1] # is an Array.
prediction = nil
if Algorithm::zero_variance? acts
prediction = acts[0]
else
- # gram matrix
- (0..(neighbor_matches.length-1)).each do |i|
- neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
- gram_matrix[i] = [] unless gram_matrix[i]
- # upper triangle
- ((i+1)..(neighbor_matches.length-1)).each do |j|
- neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
- sim_params = {}
- if params[:nr_hits]
- sim_params[:nr_hits] = true
- sim_params[:compound_features_hits] = neighbor_i_hits
- sim_params[:training_compound_features_hits] = neighbor_j_hits
- end
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
- gram_matrix[i][j] = Algorithm.gauss(sim)
- gram_matrix[j] = [] unless gram_matrix[j]
- gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
- end
- gram_matrix[i][i] = 1.0
- end
-
-
#LOGGER.debug gram_matrix.to_yaml
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
- LOGGER.debug "Setting R data ..."
- # set data
- @r.gram_matrix = gram_matrix.flatten
- @r.n = neighbor_matches.size
- @r.y = acts
- @r.sims = sims
-
+ @r.eval "set.seed(1)"
+ @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
+ @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R package "multicore"
+ @r.eval "registerDoMC()" # switch on parallel processing
begin
- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-as.vector(y)"
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
- @r.eval "sims<-as.vector(sims)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
- @r.eval "sv<-as.vector(SVindex(model))"
- @r.eval "sims<-sims[sv]"
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
- LOGGER.debug "Predicting ..."
- if type == "nu-svr"
- @r.eval "p<-predict(model,sims)[1,1]"
- elsif type == "C-bsvc"
- @r.eval "p<-predict(model,sims)"
- end
- if type == "nu-svr"
- prediction = @r.p
- elsif type == "C-bsvc"
- #prediction = (@r.p.to_f == 1.0 ? true : false)
- prediction = @r.p
- end
- @r.quit # free R
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
-
- end
- prediction
- end
-
- # Local support vector prediction from neighbors.
- # Uses propositionalized setting.
- # Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @param [Array] acts, activities for neighbors.
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @return [Numeric] A prediction value.
- def self.local_svm_prop(props, acts, type)
-
- LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
- q_prop = props[1] # is an Array.
- prediction = nil
- if Algorithm::zero_variance? acts
- prediction = acts[0]
- else
- #LOGGER.debug gram_matrix.to_yaml
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
- LOGGER.debug "Setting R data ..."
# set data
+ LOGGER.debug "Setting R data ..."
@r.n_prop = n_prop.flatten
@r.n_prop_x_size = n_prop.size
@r.n_prop_y_size = n_prop[0].size
@r.y = acts
@r.q_prop = q_prop
+ #@r.eval "y = matrix(y)"
+ @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+ @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
- begin
- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-matrix(y)"
- @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
- @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
- LOGGER.debug "Predicting ..."
- if type == "nu-svr"
- @r.eval "p<-predict(model,q_prop)[1,1]"
- elsif type == "C-bsvc"
- @r.eval "p<-predict(model,q_prop)"
- end
- if type == "nu-svr"
- prediction = @r.p
- elsif type == "C-bsvc"
- #prediction = (@r.p.to_f == 1.0 ? true : false)
- prediction = @r.p
- end
- @r.quit # free R
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
- prediction
- end
+ # prepare data
+ LOGGER.debug "Preparing R data ..."
+ @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+
+ @r.eval <<-EOR
+ rem = nearZeroVar(prop_matrix)
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ rem = findCorrelation(cor(prop_matrix))
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ EOR
- # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
- # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
- # @return[Float] Confidence
- def self.get_confidence(params)
- if params[:conf_stdev]
- sim_median = params[:sims].to_scale.median
- if sim_median.nil?
- confidence = nil
- else
- standard_deviation = params[:acts].to_scale.standard_deviation_sample
- confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
- if confidence.nan?
- confidence = nil
- end
- end
- else
- conf = params[:sims].inject{|sum,x| sum + x }
- confidence = conf/params[:neighbors].size
- end
- LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
- return confidence
- end
+ # model + support vectors
+ LOGGER.debug "Creating R SVM model ..."
+ @r.eval <<-EOR
+ model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ EOR
- # Get X and Y size of a nested Array (Matrix)
- def self.get_sizes(matrix)
- begin
- nr_cases = matrix.size
- nr_features = matrix[0].size
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
- [ nr_cases, nr_features ]
- end
- # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
- # Same for the vector describing the query compound
- # @param[Array] neighbors.
- # @param[OpenTox::Compound] query compound.
- # @param[Array] Dataset Features.
- # @param[Array] Fingerprints of neighbors.
- # @param[Float] p-values of Features.
- def self.get_props (params)
- matrix = Array.new
- begin
- params[:neighbors].each do |n|
- n = n[:compound]
- row = []
- params[:features].each do |f|
- if ! params[:fingerprints][n].nil?
- row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
- else
- row << 0.0
- end
- end
- matrix << row
- end
- row = []
- params[:features].each do |f|
- if params[:nr_hits]
- compound_feature_hits = params[:compound].match_hits([f])
- row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
- else
- row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
- end
+ # prediction
+ LOGGER.debug "Predicting ..."
+ @r.eval "p = predict(model,q_prop)"
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
+ prediction = @r.p
+
+ # censoring
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- rescue Exception => e
- LOGGER.debug "get_props failed with '" + $! + "'"
+ @r.quit # free R
end
- [ matrix, row ]
+ prediction
end
end
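A note on the censoring step above: the caret-derived performance estimate (accuracy for classification, R-squared for regression) gates the prediction against :min_train_performance. A hedged sketch of that guard in isolation:

# Hedged sketch: drop a prediction when internal validation performance
# is NaN or below the caller-supplied threshold.
def censor(prediction, performance, min_train_performance)
  return nil if performance.nan? || performance < min_train_performance
  prediction
end

censor(4.2, 0.35, 0.5)  # => nil (local model judged too weak)
censor(4.2, 0.71, 0.5)  # => 4.2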
+ module FeatureSelection
+ include Algorithm
+ # Recursive Feature Elimination using caret
+ # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
+ # @return [String] feature dataset CSV file composed of selected features.
+ def self.rfe(params)
+ @r=RinRuby.new(false,false)
+ @r.ds_csv_file = params[:ds_csv_file].to_s
+ @r.prediction_feature = params[:prediction_feature].to_s
+ @r.fds_csv_file = params[:fds_csv_file].to_s
+ @r.del_missing = params[:del_missing] == true ? 1 : 0
+ r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
+ @r.f_fds_r = r_result_file.to_s
+
+ # need packs 'randomForest', 'RANN'
+ @r.eval <<-EOR
+ set.seed(1)
+ suppressPackageStartupMessages(library('caret'))
+ suppressPackageStartupMessages(library('randomForest'))
+ suppressPackageStartupMessages(library('RANN'))
+ suppressPackageStartupMessages(library('doMC'))
+ registerDoMC()
+
+ acts = read.csv(ds_csv_file, check.names=F)
+ feats = read.csv(fds_csv_file, check.names=F)
+ ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
+
+ features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
+ y = ds[,which(names(ds) == prediction_feature)]
+
+ # assumes a data matrix 'features' and a vector 'y' of target values
+ row.names(features)=NULL
+
+ pp = NULL
+ if (del_missing) {
+ # needed if rows should be removed
+ na_ids = apply(features,1,function(x)any(is.na(x)))
+ features = features[!na_ids,]
+ y = y[!na_ids]
+ pp = preProcess(features, method=c("scale", "center"))
+ } else {
+ # Use imputation if NA's random (only then!)
+ pp = preProcess(features, method=c("scale", "center", "knnImpute"))
+ }
+ features = predict(pp, features)
+
+ # determine subsets
+ subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ subsets = c(2,3,4,5,7,10,subsets)
+ subsets = unique(sort(round(subsets)))
+ subsets = subsets[subsets<=dim(features)[2]]
+ subsets = subsets[subsets>1]
+
+ # Recursive feature elimination
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+
+ # read existing dataset and select most useful features
+ csv=feats[,c("SMILES", rfProfile$optVariables)]
+ write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
+ EOR
+ r_result_file
+ end
+ end
+
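A hedged usage sketch for the new rfe entry point; file names and the prediction feature below are illustrative:

selected_csv = OpenTox::Algorithm::FeatureSelection.rfe(
  :ds_csv_file        => "activities.csv",       # SMILES + activity column
  :fds_csv_file       => "rfe_descriptors.csv",  # SMILES + descriptor columns
  :prediction_feature => "LC50",
  :del_missing        => false                   # impute NAs instead of dropping rows
)
# => "rfe_R_descriptors.csv", the reduced feature table written by R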
module Substructure
include Algorithm
# Substructure matching
- # @param [OpenTox::Compound] compound Compound
- # @param [Array] features Array with Smarts strings
+ # @param [Hash] required keys: compound, features
# @return [Array] Array with matching Smarts
- def self.match(compound,features)
- compound.match(features)
+ def self.match(params)
+ params[:compound].match(params[:features])
end
+
+ # Substructure matching with number of non-unique hits
+ # @param [Hash] required keys: compound, features
+ # @return [Hash] Hash with matching Smarts and number of hits
+ def self.match_hits(params)
+ params[:compound].match_hits(params[:features])
+ end
+
+ # Lookup of precalculated descriptor values for a compound
+ # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type; optional: subjectid
+ # @return [Hash] Hash with feature URIs and numeric values
+ def self.lookup(params)
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ end
end
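All three entry points now take a single params hash; a hedged usage sketch (hit counts illustrative):

compound = OpenTox::Compound.from_smiles("c1ccccc1")
OpenTox::Algorithm::Substructure.match(
  :compound => compound, :features => ["cc", "cN"]
)  # => ["cc"]
OpenTox::Algorithm::Substructure.match_hits(
  :compound => compound, :features => ["cc", "cN"]
)  # => e.g. {"cc" => 12}  (number of non-unique hits)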
module Dataset
@@ -709,281 +537,5 @@ module OpenTox
def features(dataset_uri,compound_uri)
end
end
-
- module Transform
- include Algorithm
-
- # The transformer that inverts values.
- # 1/x is used, after values have been moved >= 1.
- class Inverter
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform.
- # @params[Float] Offset for restore.
- def initialize *args
- case args.size
- when 1
- begin
- values=args[0]
- raise "Cannot transform, values empty." if @values.size==0
- @values = values.collect { |v| -1.0 * v }
- @offset = 1.0 - @values.minmax[0]
- @offset = -1.0 * @offset if @offset>0.0
- @values.collect! { |v| v - @offset } # slide >1
- @values.collect! { |v| 1 / v } # invert to [0,1]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- when 2
- @offset = args[1].to_f
- @values = args[0].collect { |v| 1 / v }
- @values.collect! { |v| v + @offset }
- @values.collect! { |v| -1.0 * v }
- end
- end
- end
-
- # The transformer that takes logs.
- # Log10 is used, after values have been moved > 0.
- class Log10
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform / restore.
- # @params[Float] Offset for restore.
- def initialize *args
- @distance_to_zero = 0.000000001 # 1 / 1 billion
- case args.size
- when 1
- begin
- values=args[0]
- raise "Cannot transform, values empty." if values.size==0
- @offset = values.minmax[0]
- @offset = -1.0 * @offset if @offset>0.0
- @values = values.collect { |v| v - @offset } # slide > anchor
- @values.collect! { |v| v + @distance_to_zero } #
- @values.collect! { |v| Math::log10 v } # log10 (can fail)
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- when 2
- @offset = args[1].to_f
- @values = args[0].collect { |v| 10**v }
- @values.collect! { |v| v - @distance_to_zero }
- @values.collect! { |v| v + @offset }
- end
- end
- end
-
- # The transformer that does nothing (No OPeration).
- class NOP
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform / restore.
- # @params[Float] Offset for restore.
- def initialize *args
- @offset = 0.0
- @distance_to_zero = 0.0
- case args.size
- when 1
- @values = args[0]
- when 2
- @values = args[0]
- end
- end
- end
-
-
- # Auto-Scaler for Arrays
- # Center on mean and divide by standard deviation
- class AutoScale
- attr_accessor :scaled_values, :mean, :stdev
-
- # @params[Array] Values to transform.
- def initialize values
- @scaled_values = values
- @mean = @scaled_values.to_scale.mean
- @stdev = @scaled_values.to_scale.standard_deviation_sample
- @scaled_values = @scaled_values.collect {|vi| vi - @mean }
- @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
- end
- end
-
- # Principal Components Analysis
- # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
- class PCA
- attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
-
- # Creates a transformed dataset as GSL::Matrix.
- # @param [GSL::Matrix] Data matrix.
- # @param [Float] Compression ratio from [0,1].
- # @return [GSL::Matrix] Data transformed matrix.
- def initialize data_matrix, compression=0.05
- begin
- @data_matrix = data_matrix
- @compression = compression.to_f
- @stdev = Array.new
- @mean = Array.new
-
- # Objective Feature Selection
- raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
- @data_matrix_selected = nil
- (0..@data_matrix.size2-1).each { |i|
- if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
- if @data_matrix_selected.nil?
- @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
- @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
- else
- @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
- end
- end
- }
- raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
-
- # Scaling of Axes
- @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
- (0..@data_matrix_selected.size2-1).each { |i|
- @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
- @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
- @stdev << @autoscaler.stdev
- @mean << @autoscaler.mean
- }
-
- data_matrix_hash = Hash.new
- (0..@data_matrix_scaled.size2-1).each { |i|
- column_view = @data_matrix_scaled.col(i)
- data_matrix_hash[i] = column_view.to_scale
- }
- dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
- cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
- pca=Statsample::Factor::PCA.new(cor_matrix)
- pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
- @eigenvalue_sums = Array.new
- (0..dataset_hash.fields.size-1).each { |i|
- @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
- }
- eigenvectors_selected = Array.new
- pca.eigenvectors.each_with_index { |ev, i|
- if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
- eigenvectors_selected << ev.to_a
- end
- }
- @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
- dataset_matrix = dataset_hash.to_gsl.transpose
- @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
-
- # Restores data in the original feature space (possibly with compression loss).
- # @return [GSL::Matrix] Data matrix.
- def restore
- begin
- data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
- # reverse scaling
- (0..data_matrix_restored.size2-1).each { |i|
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
- }
- data_matrix_restored
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
-
- end
-
- end
-
- # Gauss kernel
- # @return [Float]
- def self.gauss(x, sigma = 0.3)
- d = 1.0 - x.to_f
- Math.exp(-(d*d)/(2*sigma*sigma))
- end
-
- # For symbolic features
- # @param [Array] Array to test, must indicate non-occurrence with 0.
- # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
- def self.isnull_or_singular?(array)
- nr_zeroes = array.count(0)
- return (nr_zeroes == array.size) || # remove non-occurring feature
- (nr_zeroes == array.size-1) || # remove singular feature
- (nr_zeroes == 0) # also remove feature present everywhere
- end
-
- # Numeric value test
- # @param[Object] value
- # @return [Boolean] Whether value is a number
- def self.numeric?(value)
- true if Float(value) rescue false
- end
-
- # For symbolic features
- # @param [Array] Array to test, must indicate non-occurrence with 0.
- # @return [Boolean] Whether the feature has variance zero.
- def self.zero_variance?(array)
- return (array.to_scale.variance_population == 0.0)
- end
-
- # Sum of an array for Arrays.
- # @param [Array] Array with values
- # @return [Integer] Sum of size of values
- def self.sum_size(array)
- sum=0
- array.each { |e| sum += e.size }
- return sum
- end
-
- # Minimum Frequency
- # @param [Integer] per-mil value
- # return [Integer] min-frequency
- def self.min_frequency(training_dataset,per_mil)
- minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
- minfreq = 2 unless minfreq > 2
- Integer (minfreq)
- end
-
- # Effect calculation for classification
- # @param [Array] Array of occurrences per class in the form of Enumerables.
- # @param [Array] Array of database instance counts per class.
- def self.effect(occurrences, db_instances)
- max=0
- max_value=0
- nr_o = self.sum_size(occurrences)
- nr_db = db_instances.to_scale.sum
-
- occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
- actual = o.size.to_f/nr_o
- expected = db_instances[i].to_f/nr_db
- if actual > expected
- if ((actual - expected) / actual) > max_value
- max_value = (actual - expected) / actual # 'Schleppzeiger'
- max = i
- end
- end
- }
- max
- end
-
- # Returns Support value of an fingerprint
- # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
- # return [Numeric] Support value
- def self.p_sum_support(params)
- p_sum = 0.0
- params[:features].each{|f|
- compound_hits = params[:compound_features_hits][f]
- neighbor_hits = params[:training_compound_features_hits][f]
- p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
- }
- p_sum
- end
-
end
end
-
-
diff --git a/lib/authorization.rb b/lib/authorization.rb
index 5d57781..a9744e9 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,13 +37,15 @@ module OpenTox
#Loads and sends Policyfile(XML) to open-sso server
# @param [String] URI to create a policy for
- def send(uri)
+ def send(uri)
xml = get_xml(uri)
ret = false
- ret = Authorization.create_policy(xml, @subjectid)
+ ret = Authorization.create_policy(xml, @subjectid)
+ LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
+ ret = Authorization.create_policy(xml, @subjectid) if !ret
LOGGER.debug "Policy send with subjectid: #{@subjectid}"
LOGGER.warn "Not created Policy is: #{xml}" if !ret
- ret
+ ret
end
end
@@ -337,7 +339,7 @@ module OpenTox
# @param [String] subjectid
# @return [Boolean] true if access granted, else otherwise
def self.authorized?(uri, request_method, subjectid)
- if CONFIG[:authorization][:free_request].include?(request_method)
+ if CONFIG[:authorization][:free_request].include?(request_method)
#LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
true
elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -360,7 +362,7 @@ module OpenTox
false
end
end
-
+
private
def self.free_uri?(uri, request_method)
if CONFIG[:authorization][:free_uris]
@@ -374,7 +376,7 @@ module OpenTox
end
return false
end
-
+
def self.authorize_exception?(uri, request_method)
if CONFIG[:authorization][:authorize_exceptions]
CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -387,6 +389,6 @@ module OpenTox
end
return false
end
-
+
end
-end
\ No newline at end of file
+end
diff --git a/lib/compound.rb b/lib/compound.rb
index e7b4da0..c25125b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -6,13 +6,15 @@ module OpenTox
# Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
class Compound
+ include OpenTox
+
attr_accessor :inchi, :uri
# Create compound with optional uri
# @example
- # compound = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"")
+ # compound = Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H")
# @param [optional, String] uri Compound URI
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def initialize(uri=nil)
@uri = uri
case @uri
@@ -36,9 +38,9 @@ module OpenTox
# Create a compound from smiles string
# @example
- # compound = OpenTox::Compound.from_smiles("c1ccccc1")
+ # compound = Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_smiles(smiles)
c = Compound.new
c.inchi = Compound.smiles2inchi(smiles)
@@ -48,7 +50,7 @@ module OpenTox
# Create a compound from inchi string
# @param [String] smiles InChI string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_inchi(inchi)
c = Compound.new
c.inchi = inchi
@@ -58,7 +60,7 @@ module OpenTox
# Create a compound from sdf string
# @param [String] smiles SDF string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_sdf(sdf)
c = Compound.new
c.inchi = Compound.sdf2inchi(sdf)
@@ -68,9 +70,9 @@ module OpenTox
# Create a compound from name. Relies on an external service for name lookups.
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# @param [String] name name can be also an InChI/InChiKey, CAS number, etc
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_name(name)
c = Compound.new
# paranoid URI encoding to keep SMILES charges and brackets
@@ -131,7 +133,7 @@ module OpenTox
# Match a smarts string
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match?("cN") # returns false
# @param [String] smarts Smarts string
def match?(smarts)
@@ -146,7 +148,7 @@ module OpenTox
# Match an array of smarts strings, returns array with matching smarts
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match(['cc','cN']) # returns ['cc']
# @param [Array] smarts_array Array with Smarts strings
# @return [Array] Array with matching Smarts strings
@@ -166,7 +168,7 @@ module OpenTox
# Match_hits an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match(['cc','cN']) # returns ['cc']
# @param [Array] smarts_array Array with Smarts strings
# @return [Hash] Hash with matching smarts as key and number of non-unique hits as value
@@ -191,6 +193,40 @@ module OpenTox
return smarts_hits
#smarts_array.collect { |s| s if match?(s)}.compact
end
+
+ # Lookup numerical values, returns hash with feature name as key and value as value
+ # @param [Array] Array of feature names
+ # @param [String] Feature dataset uri
+ # @return [Hash] Hash with feature name as key and value as value
+ def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
+ #entry = ds.data_entries[self.uri]
+ entry = nil
+ ds.data_entries.each { |c_uri, values|
+ if c_uri.split('/compound/').last == self.to_inchi
+ entry = ds.data_entries[self.uri]
+ break
+ end
+ }
+ LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
+
+ if entry.nil?
+ uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
+ uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
+ ds = OpenTox::Dataset.find(uri,subjectid)
+ entry = ds.data_entries[self.uri]
+ ds.delete(subjectid)
+ end
+ features = entry.keys
+ features.each { |feature|
+ new_feature = File.join(feature_dataset_uri, "feature", feature.split("/").last)
+ entry[new_feature] = entry[feature].flatten.first.to_f # see algorithm/lazar.rb:182, to_f because feature type detection doesn't work w 1 entry
+ entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
+ }
+ #res = feature_array.collect {|v| entry[v]}
+ #LOGGER.debug "----- am #{entry.to_yaml}"
+ entry
+ end
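A hedged usage sketch for the new lookup; the dataset URI, feature name, and returned value are placeholders:

compound = OpenTox::Compound.from_smiles("c1ccccc1")
compound.lookup(["XLogP"],                  # feature names
                "http://host/dataset/123",  # feature dataset URI
                "constitutional")           # pc_type
# => {"http://host/dataset/123/feature/XLogP" => 2.13}  (illustrative)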
# Get URI of compound image with highlighted fragments
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 0911073..95c1918 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -288,7 +288,7 @@ module OpenTox
# Insert a statement (compound_uri,feature_uri,value)
# @example Insert a statement (compound_uri,feature_uri,value)
- # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true
+ # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", 1
# @param [String] compound Compound URI
# @param [String] feature Compound URI
# @param [Boolean,Float] value Feature value
@@ -315,6 +315,16 @@ module OpenTox
@features[feature] = metadata
end
+ # Complete feature values by adding zeroes
+ def complete_data_entries
+ all_features = @features.keys
+ @data_entries.each { |c, e|
+ (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
+ self.add(c,f,0)
+ }
+ }
+ end
+
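In effect the method densifies the entry table; a hedged illustration:

# Before: {"c1" => {"f1" => [1]}, "c2" => {}}          (features: f1, f2)
# After:  {"c1" => {"f1" => [1], "f2" => [0]},
#          "c2" => {"f1" => [0], "f2" => [0]}}
dataset.complete_data_entries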
# Add/modify metadata for a feature
# @param [String] feature Feature URI
# @param [Hash] metadata Hash with feature metadata
@@ -363,7 +373,45 @@ module OpenTox
dataset.save(subjectid)
dataset
end
-
+
+ # merges two datasets into a new dataset (by default all compounds and features are used)
+ # precondition: both datasets are fully loaded
+ # @param [OpenTox::Dataset] dataset1 to merge
+ # @param [OpenTox::Dataset] dataset2 to merge
+ # @param [Hash] metadata
+ # @param [optional,String] subjectid
+ # @param [optional,Array] features1, if specified only these features of dataset1 are used
+ # @param [optional,Array] features2, if specified only these features of dataset2 are used
+ # @param [optional,Array] compounds1, if specified only these compounds of dataset1 are used
+ # @param [optional,Array] compounds2, if specified only these compounds of dataset2 are used
+ # example: if you want no features from dataset2, pass an empty array as features2
+ def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil )
+ features1 = dataset1.features.keys unless features1
+ features2 = dataset2.features.keys unless features2
+ compounds1 = dataset1.compounds unless compounds1
+ compounds2 = dataset2.compounds unless compounds2
+ data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}")
+ [[dataset1, features1, compounds1], [dataset2, features2, compounds2]].each do |dataset,features,compounds|
+ compounds.each{|c| data_combined.add_compound(c)}
+ features.each do |f|
+ m = dataset.features[f]
+ m[OT.hasSource] = dataset.uri unless m[OT.hasSource]
+ data_combined.add_feature(f,m)
+ compounds.each do |c|
+ dataset.data_entries[c][f].each do |v|
+ data_combined.add(c,f,v)
+ end if dataset.data_entries[c] and dataset.data_entries[c][f]
+ end
+ end
+ end
+ metadata = {} unless metadata
+ metadata[OT.hasSource] = "Merge from #{dataset1.uri} and #{dataset2.uri}" unless metadata[OT.hasSource]
+ data_combined.add_metadata(metadata)
+ data_combined.save(subjectid)
+ data_combined
+ end
+
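A hedged usage sketch for merge; URIs are placeholders, and the empty array shows the feature-exclusion case mentioned in the comment above:

d1 = OpenTox::Dataset.find("http://host/dataset/1")
d2 = OpenTox::Dataset.find("http://host/dataset/2")
merged = OpenTox::Dataset.merge(d1, d2, {DC.title => "combined"}, nil,
                                nil, [])  # all of d1, only compounds of d2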
# Save dataset at the dataset service
# - creates a new dataset if uri is not set
# - overwrites dataset if uri exists
diff --git a/lib/environment.rb b/lib/environment.rb
index 3775797..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -24,7 +24,11 @@ end
# database
#`redis-server /opt/redis/redis.conf` unless File.exists? "/var/run/redis.pid" # removed by AM
-Ohm.connect :thread_safe => true
+ohm_port=6379
+if !CONFIG[:ohm_port].nil?
+ ohm_port=CONFIG[:ohm_port].to_i
+end
+Ohm.connect(:thread_safe => true, :port => ohm_port)
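With this change the Redis port becomes configurable; a hedged sketch, assuming the usual Ruby CONFIG hash of opentox-ruby:

CONFIG[:ohm_port] = 6380  # Ohm/Redis connects here; falls back to 6379 when unset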
# load mail settings for error messages
#load File.join config_dir,"mail.rb" if File.exists?(File.join config_dir,"mail.rb")
@@ -87,4 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
diff --git a/lib/model.rb b/lib/model.rb
index 0b116c2..a858a0f 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -102,8 +102,8 @@ module OpenTox
include Algorithm
include Model
- attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev, :prediction_min_max
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
def initialize(uri=nil)
if uri
@@ -120,18 +120,11 @@ module OpenTox
@p_values = {}
@fingerprints = {}
@value_map = {}
- @prediction_min_max = []
@feature_calculation_algorithm = "Substructure.match"
@similarity_algorithm = "Similarity.tanimoto"
@prediction_algorithm = "Neighbors.weighted_majority_vote"
- @nr_hits = false
- @min_sim = 0.3
- @prop_kernel = false
- @transform = { "class" => "NOP" }
- @conf_stdev = false
-
end
# Get URIs of all lazar models
@@ -174,19 +167,14 @@ module OpenTox
lazar.feature_calculation_algorithm = hash["feature_calculation_algorithm"] if hash["feature_calculation_algorithm"]
lazar.similarity_algorithm = hash["similarity_algorithm"] if hash["similarity_algorithm"]
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
- lazar.min_sim = hash["min_sim"] if hash["min_sim"]
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
- lazar.prop_kernel = hash["prop_kernel"] if hash["prop_kernel"]
lazar.value_map = hash["value_map"] if hash["value_map"]
- lazar.nr_hits = hash["nr_hits"] if hash["nr_hits"]
- lazar.transform = hash["transform"] if hash["transform"]
- lazar.conf_stdev = hash["conf_stdev"] if hash["conf_stdev"]
- lazar.prediction_min_max = hash["prediction_min_max"] if hash["prediction_min_max"]
+
lazar
end
def to_json
- Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :min_sim => @min_sim, :subjectid => @subjectid, :prop_kernel => @prop_kernel, :value_map => @value_map, :nr_hits => @nr_hits, :transform => @transform, :conf_stdev => @conf_stdev, :prediction_min_max => @prediction_min_max})
+ Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
end
def run( params, accept_header=nil, waiting_task=nil )
@@ -230,8 +218,11 @@ module OpenTox
predict(compound_uri,false,subjectid)
count += 1
waiting_task.progress( count/d.compounds.size.to_f*100.0 ) if waiting_task
- rescue => ex
- LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+ex.message+" subjectid: #{subjectid}"
+ rescue => e
+ LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+e.message+" subjectid: #{subjectid}"
+ #LOGGER.debug "#{e.class}: #{e.message}"
+ #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+
end
end
#@prediction_dataset.save(subjectid)
@@ -246,7 +237,6 @@ module OpenTox
@compound = Compound.new compound_uri
features = {}
-
#LOGGER.debug self.to_yaml
unless @prediction_dataset
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -257,29 +247,42 @@ module OpenTox
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
} )
end
-
if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
all_activities = []
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
- @prediction_min_max[0] = (all_activities.to_scale.min/2)
- @prediction_min_max[1] = (all_activities.to_scale.max*2)
end
-
unless database_activity(subjectid) # adds database activity to @prediction_dataset
+ # Calculation of needed values for query compound
+ @compound_features = eval("#{@feature_calculation_algorithm}({
+ :compound => @compound,
+ :features => @features,
+ :feature_dataset_uri => @metadata[OT.featureDataset],
+ :pc_type => self.parameter(\"pc_type\"),
+ :subjectid => subjectid
+ })")
+ # Add the fingerprint of the query compound: features mapped to weighted values (e.g. p_value * nr_hits)
+ @compound_fingerprints = {}
+ @compound_features.each do |feature, value| # value is nil if "Substructure.match"
+ if @feature_calculation_algorithm == "Substructure.match_hits"
+ @compound_fingerprints[feature] = @p_values[feature] * value
+ elsif @feature_calculation_algorithm == "Substructure.match"
+ @compound_fingerprints[feature] = @p_values[feature]
+ elsif @feature_calculation_algorithm == "Substructure.lookup"
+ @compound_fingerprints[feature] = value
+ end
+ end
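+ # Worked example (hypothetical feature and numbers): for a substructure
+ # feature with p_value 0.8 that matches the query compound 3 times,
+ # "Substructure.match_hits" stores 0.8 * 3 = 2.4, "Substructure.match"
+ # stores 0.8, and "Substructure.lookup" stores the raw descriptor value.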
- neighbors
- prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors,
- :compound => @compound,
- :features => @features,
- :p_values => @p_values,
- :fingerprints => @fingerprints,
- :similarity_algorithm => @similarity_algorithm,
- :prop_kernel => @prop_kernel,
+ # Transform model data to machine learning scheme (tables of data)
+ mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
+ mtf.transform
+
+ # Make a prediction
+ prediction = eval("#{@prediction_algorithm}( { :props => mtf.props,
+ :acts => mtf.acts,
+ :sims => mtf.sims,
:value_map => @value_map,
- :nr_hits => @nr_hits,
- :conf_stdev => @conf_stdev,
- :prediction_min_max => @prediction_min_max,
- :transform => @transform } ) ")
+ :min_train_performance => self.parameter(\"min_train_performance\")
+ } ) ")
value_feature_uri = File.join( @uri, "predicted", "value")
confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@@ -355,44 +358,6 @@ module OpenTox
@prediction_dataset
end
-
-
- # Find neighbors and store them as object variable, access all compounds for that.
- def neighbors
- @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
- @neighbors = []
- @fingerprints.keys.each do |training_compound| # AM: access all compounds
- add_neighbor @fingerprints[training_compound].keys, training_compound
- end
- end
-
- # Adds a neighbor to @neighbors if it passes the similarity threshold.
- def add_neighbor(training_features, training_compound)
- compound_features_hits = {}
- training_compound_features_hits = {}
- if @nr_hits
- compound_features_hits = @compound.match_hits(@compound_features)
- training_compound_features_hits = @fingerprints[training_compound]
- #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class} #{training_compound_features_hits}"
- end
- params = {}
- params[:nr_hits] = @nr_hits
- params[:compound_features_hits] = compound_features_hits
- params[:training_compound_features_hits] = training_compound_features_hits
-
- sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)")
- if sim > @min_sim
- @activities[training_compound].each do |act|
- @neighbors << {
- :compound => training_compound,
- :similarity => sim,
- :features => training_features,
- :activity => act
- }
- end
- end
- end
-
# Find database activities and store them in @prediction_dataset
# @return [Boolean] true if compound has database activities, false if not
def database_activity(subjectid)
diff --git a/lib/opentox-ruby.rb b/lib/opentox-ruby.rb
index 1fa2a86..d25632c 100644
--- a/lib/opentox-ruby.rb
+++ b/lib/opentox-ruby.rb
@@ -9,6 +9,6 @@ rescue LoadError
end
['opentox', 'compound','dataset', 'parser','serializer', 'algorithm','model','task','validation','feature',
- 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology' ].each do |lib|
+ 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology', 'r-util' ].each do |lib|
require lib
end
diff --git a/lib/parser.rb b/lib/parser.rb
index d0975af..56e4fed 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -57,7 +57,7 @@ module OpenTox
`rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
triple = line.to_triple
if triple[0] == @uri
- if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
+ if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
@metadata[triple[1]] = [] unless @metadata[triple[1]]
@metadata[triple[1]] << triple[2].split('^^').first
else
@@ -290,10 +290,11 @@ module OpenTox
@features = []
@feature_types = {}
- @format_errors = ""
- @smiles_errors = []
+ @format_errors = []
+ @id_errors = []
@activity_errors = []
@duplicates = {}
+ @max_class_values = 3
end
def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
# Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
# @param [Excel] book Excel workbook object (created with roo gem)
# @return [OpenTox::Dataset] Dataset object with Excel data
- def load_spreadsheet(book)
+ def load_spreadsheet(book, drop_missing=false)
book.default_sheet = 0
- add_features book.row(1)
+ headers = book.row(1)
+ add_features headers
value_maps = Array.new
regression_features=Array.new
@@ -319,15 +321,27 @@ module OpenTox
row = book.row(i)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+ if vm.size > @max_class_values # @max_class_values is the maximum nr of classes supported by Fminer.
regression_features[j]=true
else
regression_features[j]=false
end
}
}
+
2.upto(book.last_row) { |i|
- add_values book.row(i), regression_features
+ drop=false
+ row = book.row(i)
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+ if row.include?("")
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
+ drop=true
+ drop_missing=true if (row.count("") == row.size-1)
+ end
+ add_values(row, regression_features) unless (drop_missing && drop)
+ if (drop_missing && drop)
+ @format_errors << "Row #{i} not added"
+ end
}
warnings
@dataset
@@ -336,10 +350,11 @@ module OpenTox
# Load CSV string (format specification: http://toxcreate.org/help)
# @param [String] csv CSV representation of the dataset
# @return [OpenTox::Dataset] Dataset object with CSV data
- def load_csv(csv)
+ def load_csv(csv, drop_missing=false)
row = 0
input = csv.split("\n")
- add_features split_row(input.shift)
+ headers = split_row(input.shift)
+ add_features(headers)
value_maps = Array.new
regression_features=Array.new
@@ -347,15 +362,27 @@ module OpenTox
row = split_row(row)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+ if vm.size > @max_class_values # max @max_class_values classes.
regression_features[j]=true
else
regression_features[j]=false
end
}
}
- input.each { |row|
- add_values split_row(row), regression_features
+
+ input.each_with_index { |row, i|
+ drop=false
+ row = split_row(row)
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+ if row.include?("")
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
+ drop=true
+ drop_missing=true if (row.count("") == row.size-1)
+ end
+ add_values(row, regression_features) unless (drop_missing && drop)
+ if (drop_missing && drop)
+ @format_errors << "Row #{i} not added"
+ end
}
warnings
@dataset
@@ -367,88 +394,115 @@ module OpenTox
info = ''
@feature_types.each do |feature,types|
- if types.uniq.size > 1
+ if types.uniq.size == 0
+ type = "helper#MissingFeature"
+ elsif types.uniq.size > 1
type = OT.NumericFeature
else
type = types.first
end
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
# TODO: rewrite feature values
- # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
+ # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
end
@dataset.metadata[OT.Info] = info
warnings = ''
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
+ warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+ warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
duplicate_warnings = ''
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
- warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+ warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
@dataset.metadata[OT.Warnings] = warnings
end
+ # Adds a row of features to a dataset
+ # @param Array A row split up as an array
+ # Indices of duplicate features are recorded in @duplicate_feature_indices
def add_features(row)
- row.shift # get rid of smiles entry
- row.each do |feature_name|
+ row=row.collect
+ row.shift # get rid of id entry
+ @duplicate_feature_indices = [] # indices start at 0 for the first feature after the id column
+ row.each_with_index do |feature_name, idx|
feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
- @feature_types[feature_uri] = []
- @features << feature_uri
- @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ unless @features.include? feature_uri
+ @feature_types[feature_uri] = []
+ @features << feature_uri
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ else
+ @duplicate_feature_indices << idx
+ @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
+ end
end
end
# Adds a row to a dataset
# @param Array A row split up as an array
# @param Array Indicator for regression for each field
+ # (indices of duplicate features are read from @duplicate_feature_indices)
def add_values(row, regression_features)
- smiles = row.shift
- compound = Compound.from_smiles(smiles)
+ id = row.shift
+ case id
+ when /InChI/
+ compound = Compound.from_inchi(URI.decode_www_form_component(id))
+ else
+ compound = Compound.from_smiles(id)
+ end
+
if compound.nil? or compound.inchi.nil? or compound.inchi == ""
- @smiles_errors << smiles+", "+row.join(", ")
+ @id_errors << id+", "+row.join(", ")
return false
end
@duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
- @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+ @duplicates[compound.inchi] << id+", "+row.join(", ")
+ feature_idx = 0
row.each_index do |i|
- value = row[i]
- feature = @features[i]
- type = nil
- if (regression_features[i])
- type = feature_type(value)
- if type != OT.NumericFeature
- raise "Error! Expected numeric values."
+ unless @duplicate_feature_indices.include? i
+
+ value = row[i]
+ #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
+ feature = @features[feature_idx]
+
+ type = feature_type(value) # May be NIL
+ type = OT.NominalFeature unless (type.nil? || regression_features[i])
+ @feature_types[feature] << type if type
+
+ val = nil
+ case type
+ when OT.NumericFeature
+ val = value.to_f
+ when OT.NominalFeature
+ val = value.to_s
end
- else
- type = OT.NominalFeature
- end
- @feature_types[feature] << type
- case type
- when OT.NumericFeature
- val = value.to_f
- when OT.NominalFeature
- val = value.to_s
- end
- if val!=nil
- @dataset.add(compound.uri, feature, val)
- if type!=OT.NumericFeature
- @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
- @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+ feature_idx += 1
+
+ if val != nil
+ @dataset.add(compound.uri, feature, val)
+ if type != OT.NumericFeature
+ @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+ @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+ end
end
+
end
+
end
end
def feature_type(value)
- if OpenTox::Algorithm::numeric? value
+ if value == ""
+ return nil
+ elsif OpenTox::Algorithm::numeric? value
return OT.NumericFeature
else
return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
end
def split_row(row)
- row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
+ row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
end
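+ # Effect of the -1 limit (plain Ruby behaviour, hypothetical input):
+ # "a,b,,".split(/\s*[,;\t]\s*/, -1) #=> ["a", "b", "", ""]
+ # Without -1 the trailing empty cells would be dropped and the row size
+ # check against the headers would fail for rows with trailing blanks.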
end
@@ -468,6 +522,7 @@ module OpenTox
def initialize
@data = {}
@activity_errors = []
+ @max_class_values = 3
end
def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
def clean_features
ignored_features = []
features.each do |feature|
- if feature_values(feature).size > 5
+ if feature_values(feature).size > @max_class_values
if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
# REGRESSION
elsif feature_types(feature).include? OT.NumericFeature
@data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
@activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
else
- @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+ @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
ignored_features << feature
next
end
@@ -543,12 +598,15 @@ module OpenTox
private
def feature_type(value)
- if OpenTox::Algorithm::numeric? value
+ if value.nil?
+ return nil
+ elsif OpenTox::Algorithm::numeric? value
return OT.NumericFeature
else
return OT.NominalFeature
end
end
+
end
# quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
@duplicates[inchi] << rec #inchi#+", "+row.join(", ")
compound = Compound.from_inchi inchi
rescue
- @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
next
end
row = {}
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
table.data[compound.uri] = row
end
-
- # finda and remove ignored_features
+
+ # find and remove ignored_features
@activity_errors = table.clean_features
table.add_to_dataset @dataset
warnings = ''
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+ warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
duplicate_warnings = ''
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
diff --git a/lib/r-util.rb b/lib/r-util.rb
new file mode 100644
index 0000000..7163c46
--- /dev/null
+++ b/lib/r-util.rb
@@ -0,0 +1,354 @@
+# pending: package dir hack ---------
+# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
+# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
+package_dir = CONFIG[:base_dir].split("/")
+package_dir[-1] = "r-packages"
+package_dir = package_dir.join("/")
+PACKAGE_DIR = package_dir
+
+require "tempfile"
+
+module OpenTox
+
+ class RUtil
+
+ @@feats = {}
+
+ def initialize
+ @r = RinRuby.new(true,false) unless defined?(@r) and @r
+ @r.eval ".libPaths('#{PACKAGE_DIR}')"
+ @r_packages = @r.pull "installed.packages()[,1]"
+ ["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
+ @r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
+ end
+
+ def quit_r
+ begin
+ @r.quit
+ @r = nil
+ rescue
+ end
+ end
+
+ def r
+ @r
+ end
+
+ def package_installed?( package )
+ @r_packages.include?(package)
+ end
+
+ def install_package( package )
+ unless package_installed?(package)
+ LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
+ @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
+ end
+ end
+
+ # <0 -> array1 << array2
+ # 0 -> no significant difference
+ # >0 -> array1 >> array2
+ def paired_ttest(array1, array2, significance_level=0.95)
+ @r.assign "v1",array1
+ @r.assign "v2",array2
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+ t = @r.pull "ttest$statistic"
+ p = @r.pull "ttest$p.value"
+ if (1-significance_level > p)
+ t
+ else
+ 0
+ end
+ end
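+ # Minimal usage sketch (hypothetical numbers):
+ #   util = OpenTox::RUtil.new
+ #   util.paired_ttest([0.7, 0.8, 0.6], [0.5, 0.6, 0.7])
+ # returns the t statistic when p < 0.05 (significance_level 0.95), else 0.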
+
+ # example:
+ # files = ["/tmp/box.svg","/tmp/box.png"]
+ # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
+ # boxplot(files, data, "comparison1" )
+ #
+ def boxplot(files, data, title="")
+ LOGGER.debug("r-util> create boxplot")
+ assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
+ plot_to_files(files) do |file|
+ @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
+ end
+ end
+
+ # embeds feature values of two datasets into 2D and plots them
+ # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
+ #
+ def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
+ features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+
+ raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
+ LOGGER.debug("r-util> create feature value plot")
+ d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
+ d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
+ if features
+ [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
+ else
+ raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
+ (d1.features.keys.sort != d2.features.keys.sort)
+ features = d1.features.keys
+ end
+ raise "at least two features needed" if d1.features.keys.size<2
+ waiting_task.progress(25) if waiting_task
+
+ df1 = dataset_to_dataframe(d1,0,subjectid,features)
+ df2 = dataset_to_dataframe(d2,0,subjectid,features)
+ waiting_task.progress(50) if waiting_task
+
+ @r.eval "df <- rbind(#{df1},#{df2})"
+ @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+ @r.names = [dataset_name1, dataset_name2]
+ LOGGER.debug("r-util> - convert data to 2d")
+ @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+ waiting_task.progress(75) if waiting_task
+
+ if fast_plot
+ info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+ else
+ info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+ end
+ LOGGER.debug("r-util> - plot data")
+ plot_to_files(files) do |file|
+ @r.eval "plot_split( df.2d, split, names, #{info})"
+ end
+ end
+
+ # plots a double histogram
+ # data1 and data2 are arrays with values, either numerical or categorical (string values)
+ # is_numerical, boolean flag indicating value types
+ # log (only for numerical), plot logarithm of values
+ def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
+ LOGGER.debug("r-util> create double hist plot")
+ all = data1 + data2
+ if (is_numerical)
+ @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
+ {
+ if (log)
+ {
+ data1 <- log(data1)
+ data2 <- log(data2)
+ xlab = paste('logarithm of',xlab,sep=' ')
+ }
+ xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
+ h <- hist(rbind(data1,data2),plot=F)
+ h1 <- hist(data1,plot=F,breaks=h$breaks)
+ h2 <- hist(data2,plot=F,breaks=h$breaks)
+ xlims = c(min(h$breaks),max(h$breaks))
+ ylims = c(0,max(h1$counts,h2$counts))
+ xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
+ plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
+ main=title, xlab=xlab, ylab='counts' )
+ plot(h2, col=rgb(0,1,0,2/4), add=T )
+ legend('topleft',names,lty=c(1,1),col=c('red','green'))
+ }"
+ @r.assign("data1",data1)
+ @r.assign("data2",data2)
+ @r.legend = [name1, name2]
+ else
+ raise "log not valid for categorial" if log
+ vals = all.uniq.sort!
+ counts1 = vals.collect{|e| data1.count(e)}
+ counts2 = vals.collect{|e| data2.count(e)}
+ @r.data1 = counts1
+ @r.data2 = counts2
+ @r.value_names = [name1, name2]
+ @r.legend = vals
+ @r.eval("data <- cbind(data1,data2)")
+ end
+
+ plot_to_files(files) do |file|
+ if (is_numerical)
+ @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
+ else
+ @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
+ main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
+ @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
+ end
+ end
+ end
+
+ # stratified split of a dataset into two datasets, based on the feature values
+ # all features are taken into account unless <split_features> is given
+ def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+ raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+ LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
+
+ df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+ @r.eval "set.seed(#{seed})"
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+ split = @r.pull 'split'
+ split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+ split_to_datasets( df, split, subjectid )
+ end
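+ # Minimal usage sketch, assuming a fully loaded dataset:
+ #   ds = OpenTox::Dataset.find(dataset_uri, subjectid)
+ #   sampled, rest = OpenTox::RUtil.new.stratified_split(ds, "NA", 0.3, subjectid)
+ # After the 0/1 inversion the sampled fraction (pct, here ~30%) comes first.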
+
+ # dataset should be loaded completely (use Dataset.find)
+ # takes duplicates into account
+ # replaces missing values with param <missing_value>
+ # returns dataframe-variable-name in R
+ def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
+ LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
+
+ # count duplicates
+ num_compounds = {}
+ dataset.features.keys.each do |f|
+ dataset.compounds.each do |c|
+ if dataset.data_entries[c]
+ val = dataset.data_entries[c][f]
+ size = val==nil ? 1 : val.size
+ num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
+ else
+ num_compounds[c] = 1
+ end
+ end
+ end
+
+ # use either all, or the provided features, sorting is important as col-index := features
+ if features
+ features.sort!
+ else
+ features = dataset.features.keys.sort
+ end
+ compounds = []
+ dataset.compounds.each do |c|
+ num_compounds[c].times do |i|
+ compounds << c
+ end
+ end
+
+ # values into 2D array, then to dataframe
+ d_values = []
+ dataset.compounds.each do |c|
+ num_compounds[c].times do |i|
+ c_values = []
+ features.each do |f|
+ if dataset.data_entries[c]
+ val = dataset.data_entries[c][f]
+ v = val==nil ? "" : val[i].to_s
+ else
+ raise "wtf" if i>0
+ v = ""
+ end
+ v = missing_value if v.size()==0
+ c_values << v
+ end
+ d_values << c_values
+ end
+ end
+ df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
+ assign_dataframe(df_name,d_values,compounds,features)
+
+ # set dataframe column types accordingly
+ f_count = 1 #R starts at 1
+ features.each do |f|
+ feat = OpenTox::Feature.find(f,subjectid)
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+ if nominal
+ @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
+ else
+ @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
+ end
+ f_count += 1
+ end
+ #@r.eval "head(#{df_name})"
+
+ # store compounds, and features (including metainformation)
+ @@feats[df_name] = {}
+ features.each do |f|
+ @@feats[df_name][f] = dataset.features[f]
+ end
+ df_name
+ end
+
+ # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
+ # this is only possible if the dataframe (or a superset of it) was created by dataset_to_dataframe (metadata and URIs are required)
+ def dataframe_to_dataset( df, subjectid=nil )
+ dataframe_to_dataset_indices( df, subjectid, nil)
+ end
+
+ private
+ def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+ raise unless @@feats[df].size>0
+ values, compounds, features = pull_dataframe(df)
+ features.each{|f| raise unless @@feats[df][f]}
+ dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
+ compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
+ features.each{|f| dataset.add_feature(f,@@feats[df][f])}
+ features.size.times do |c|
+ feat = OpenTox::Feature.find(features[c],subjectid)
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+ compounds.size.times do |r|
+ if compound_indices==nil or compound_indices.include?(r)
+ dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+ end
+ end
+ end
+ dataset.save(subjectid)
+ dataset
+ end
+
+ def split_to_datasets( df, split, subjectid=nil )
+ sets = []
+ (split.min.to_i .. split.max.to_i).each do |i|
+ indices = []
+ split.size.times{|j| indices<<j if split[j]==i}
+ dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+ sets << dataset
+ end
+ sets
+ end
+
+ def pull_dataframe(df)
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+ @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
+ res = []; compounds = []; features = []
+ first = true
+ file = File.new(tmp, 'r')
+ file.each_line("\n") do |row|
+ if first
+ features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+ first = false
+ else
+ vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+ compounds << vals[0]
+ res << vals[1..-1]
+ end
+ end
+ begin File.delete(tmp); rescue; end
+ return res, compounds, features
+ end
+
+ def assign_dataframe(df,input,rownames,colnames)
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+ file = File.new(tmp, 'w')
+ input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
+ file.flush
+ @r.rownames = rownames if rownames
+ @r.colnames = colnames
+ @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
+ "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
+ begin File.delete(tmp); rescue; end
+ end
+
+ def plot_to_files(files)
+ files.each do |file|
+ if file=~/(?i)\.svg/
+ @r.eval("svg('#{file}',10,8)")
+ elsif file=~/(?i)\.png/
+ @r.eval("png('#{file}')")
+ else
+ raise "invalid format: "+file.to_s
+ end
+ yield file
+ LOGGER.debug "r-util> plotted to #{file}"
+ @r.eval("dev.off()")
+ end
+ end
+ end
+end
+
+
diff --git a/lib/rest_client_wrapper.rb b/lib/rest_client_wrapper.rb
index 6d25bb3..fcadebb 100644
--- a/lib/rest_client_wrapper.rb
+++ b/lib/rest_client_wrapper.rb
@@ -70,7 +70,7 @@ module OpenTox
begin
#LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
- resource = RestClient::Resource.new(uri,{:timeout => 60})
+ resource = RestClient::Resource.new(uri,{:timeout => 600})
if rest_call=="post" || rest_call=="put"
result = resource.send(rest_call, payload, headers)
else
diff --git a/lib/serializer.rb b/lib/serializer.rb
index b62ac45..30cb2ba 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -55,7 +55,7 @@ module OpenTox
OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
- #object props for validation#
+ #object props for validation#
OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
- # annotation props for validation
+ # annotation props for validation
OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
@data_entries = {}
@values_id = 0
@parameter_id = 0
-
- @classes = Set.new
+
+ @classes = Set.new
@object_properties = Set.new
@annotation_properties = Set.new
@datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
@object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
add_metadata uri, metadata
end
-
+
# Add a resource defined by resource_class and content
# (see documentation of add_content for example)
# @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
def add_uri(uri,type)
@object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
end
-
+
private
@@content_id = 1
-
+
#Recursive function to add content
#@example
# { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
hash.each do |u,v|
if v.is_a? Hash
# value is again a hash, i.e. a new owl class is added
- # first make sure type (==class) is set
+ # first make sure type (==class) is set
type = v[RDF.type]
raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
# add content to new class
add_content(genid,v)
elsif v.is_a? Array
- # value is an array, i.e. a list of values with property is added
+ # value is an array, i.e. a list of values with property is added
v.each{ |vv| add_content( uri, { u => vv } ) }
else # v.is_a? String
# simple string value
@@ -268,7 +268,7 @@ module OpenTox
end
end
end
-
+
public
# Add metadata
@@ -329,7 +329,7 @@ module OpenTox
v = [{ "type" => "uri", "value" => value}]
when "literal"
v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
- else
+ else
raise "Illegal type #{type(value)} for #{value}."
end
@object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
end
# Serializers
-
+
# Convert to N-Triples
# @return [text/plain] Object OWL-DL in N-Triples format
def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
entry.each do |p,objects|
p = url(p)
objects.each do |o|
- case o["type"]
+ case o["type"]
when "uri"
o = url(o["value"])
when "literal"
@@ -371,9 +371,15 @@ module OpenTox
# Convert to RDF/XML
# @return [text/plain] Object OWL-DL in RDF/XML format
def to_rdfxml
- Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+ tmpf = Tempfile.open("owl-serializer")
+ tmpf.write(self.to_ntriples)
+ tmpf.flush
+ @path = tmpf.path
# TODO: add base uri for ist services
- `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ tmpf.close
+ tmpf.delete
+ res
end
# Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
@@ -427,20 +433,20 @@ module OpenTox
end
def literal(value,type)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatenation operators than +
'"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
end
def url(uri)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatenation operators than +
'<'.concat(uri).concat('>')
end
def rdf_types
- @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
- @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
- @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
- @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+ @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+ @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+ @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+ @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
end
end
@@ -457,35 +463,46 @@ module OpenTox
@rows.first << features
@rows.first.flatten!
dataset.data_entries.each do |compound,entries|
- smiles = Compound.new(compound).to_smiles
+ cmpd = Compound.new(compound)
+ smiles = cmpd.to_smiles
+ inchi = URI.encode_www_form_component(cmpd.to_inchi)
+ row_container = Array.new
row = Array.new(@rows.first.size)
- row[0] = smiles
+ row_container << row
+ #row[0] = smiles
+ row[0] = inchi
entries.each do |feature, values|
i = features.index(feature)+1
values.each do |value|
- if row[i]
- row[i] = "#{row[i]} #{value}" # multiple values
+ if row_container[0][i]
+ #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+ row_container << row_container.last.collect
+ row_container.last[i] = value
+ #LOGGER.debug "RC: #{row_container.to_yaml}"
else
- row[i] = value
+ row_container.each { |r| r[i] = value }
end
end
end
- @rows << row
+ row_container.each { |r| @rows << r }
end
end
# Convert to CSV string
# @return [String] CSV string
def to_csv
- @rows.collect{|r| r.join(", ")}.join("\n")
+ rows = @rows.collect
+ result = ""
+ result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+ result << rows.collect{ |r| r.join(",") }.join("\n")
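+ # Example (hypothetical feature URI): a header cell
+ # "http://host/dataset/1/feature/LogP" is shortened by f.split('/').last
+ # to "LogP"; the data rows are joined unchanged.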
end
# Convert to spreadsheet workbook
# @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gem to write a file)
- def to_spreadsheet
+ def to_spreadsheet(sheetname="sheet1")
Spreadsheet.client_encoding = 'UTF-8'
book = Spreadsheet::Workbook.new
- sheet = book.create_worksheet(:name => '')
+ sheet = book.create_worksheet(:name => "#{sheetname}")
sheet.column(0).width = 100
i = 0
@rows.each do |row|
diff --git a/lib/stratification.R b/lib/stratification.R
new file mode 100644
index 0000000..76ff2d8
--- /dev/null
+++ b/lib/stratification.R
@@ -0,0 +1,201 @@
+
+nominal_to_binary <- function( data )
+{
+ result = NULL
+ for (i in 1:ncol(data))
+ {
+ #print(i)
+ if (is.numeric( data[,i] ) )
+ {
+ if (is.null(result))
+ result = data.frame(data[,i])
+ else
+ result = data.frame(result, data[,i])
+ colnames(result)[ncol(result)] <- colnames(data)[i]
+ }
+ else
+ {
+ vals = unique(data[,i])
+ for (j in 1:length(vals))
+ {
+ #print(j)
+ bins = c()
+ for (k in 1:nrow(data))
+ {
+ if(data[,i][k] == vals[j])
+ bins = c(bins,1)
+ else
+ bins = c(bins,0)
+ }
+ #print(bins)
+ if (is.null(result))
+ result = data.frame(bins)
+ else
+ result = data.frame(result, bins)
+ colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+ if (length(vals)==2) break
+ }
+ }
+ }
+ #print(head(result))
+ result
+}
+
+process_data <- function( data )
+{
+ data.num <- as.data.frame(data)
+ if (!is.numeric(data.num))
+ {
+ data.num = nominal_to_binary(data.num)
+ }
+ if(any(is.na(data.num)))
+ {
+ require("gam")
+ data.repl = na.gam.replace(data.num)
+ }
+ else
+ data.repl = data.num
+ data.repl
+}
+
+cluster <- function( data, min=10, max=15 )
+{
+ require("vegan")
+ max <- min(max,nrow(unique(data)))
+ max <- min(max,nrow(data)-1)
+ if (min>max)
+ min=max
+ print(paste("cascade k-means ",min," - ",max))
+ s = cascadeKM(data,min,max,iter=30)
+ m = max.col(s$results)[2]
+ print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+ cbind(s$partition[,m])
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster" )
+{
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ require("sampling")
+ # adjust ratio to make samplecube return exact number of samples
+ ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ pik = rep(ratio,times=nrow(data.processed))
+ data.strat = cbind(pik,data.processed)
+ samplecube(data.strat,pik,order=2,comment=F)
+ }
+ else if (method == "cluster")
+ {
+ cl = cluster(data.processed)
+# require("caret")
+# res = createDataPartition(cl,p=ratio)
+# split = rep(1, times=nrow(data))
+# for (j in 1:nrow(data))
+# if ( is.na(match(j,res$Resample1)) )
+# split[j]=0
+# split
+ require("sampling")
+ stratified_split(cl,ratio,"samplecube")
+ }
+ else
+ stop("unknown method")
+}
+
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+{
+ print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ folds = rep(0, times=nrow(data))
+ for (i in 1:(num_folds-1))
+ {
+ require("sampling")
+ prop = 1/(num_folds-(i-1))
+ print(paste("fold",i,"/",num_folds," prop",prop))
+ pik = rep(prop,times=nrow(data))
+ for (j in 1:nrow(data))
+ if(folds[j]!=0)
+ pik[j]=0
+ data.strat = cbind(pik,data.processed)
+ s<-samplecube(data.strat,pik,order=2,comment=F)
+ print(paste("fold size: ",sum(s)))
+ for (j in 1:nrow(data))
+ if (s[j] == 1)
+ folds[j]=i
+ }
+ for (j in 1:nrow(data))
+ if (folds[j] == 0)
+ folds[j]=num_folds
+ folds
+ }
+ else if (method == "cluster")
+ {
+ require("TunePareto")
+ cl = cluster(data.processed)
+ res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+ folds = rep(0, times=nrow(data))
+ for (i in 1:num_folds)
+ for(j in 1:length(res[[1]][[i]]))
+ folds[res[[1]][[i]][j]]=i
+ folds
+ }
+ else
+ stop("unknown method")
+}
+
+plot_pre_process <- function( data, method="pca" )
+{
+ data.processed = process_data( data )
+ if (method == "pca")
+ {
+ data.pca <- prcomp(data.processed, scale=TRUE)
+ as.data.frame(data.pca$x)[1:2]
+ }
+ else if (method == "smacof")
+ {
+ require("smacof")
+ data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+ data.emb$conf
+ }
+ else
+ stop("unknown method")
+}
+
+plot_split <- function( data, split, names=NULL, ... )
+{
+ if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+ stop("data not suitable for plotting, plot_pre_process() first")
+
+ plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+ if (is.null(names))
+ names <- c("split 1","split 2")
+ colos = as.double(rep(2:(max(split)+2)))
+ legend("topleft",names,pch=2,col=colos)
+
+ for (j in max(split):0)
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (split[i] == j)
+ set = c(set,i)
+ points(data[set,], pch = 2, col=(j+2))
+ }
+}
+
+#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.33, method="cluster")
+#print(sum(split))
+#plot_split(plot_pre_process(data),split,c("training","test"))
+
+#cl = cluster(data)
+
+
+
+
diff --git a/lib/task.rb b/lib/task.rb
index e6fa5e1..102f4dc 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -242,16 +242,20 @@ module OpenTox
# waits for a task, unless time exceeds or state is no longer running
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @param [optional,Numeric] dur seconds pausing before cheking again for completion
- def wait_for_completion( waiting_task=nil, dur=0.3)
+ def wait_for_completion( waiting_task=nil)
waiting_task.waiting_for(self.uri) if waiting_task
due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+ start_time = Time.new
+ dur = 0
LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
load_metadata # for extremely fast tasks
check_state
while self.running? or self.queued?
sleep dur
+ dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+ #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
load_metadata
# if another (sub)task is waiting for self, set progress accordingly
waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
diff --git a/lib/transform.rb b/lib/transform.rb
new file mode 100644
index 0000000..8fe1093
--- /dev/null
+++ b/lib/transform.rb
@@ -0,0 +1,520 @@
+module OpenTox
+ module Transform
+ # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+ # LogAutoScaler for GSL vectors.
+ # Take log and scale.
+ class LogAutoScale
+ attr_accessor :vs, :offset, :autoscaler
+
+ # @param [GSL::Vector] Values to transform using LogAutoScaling.
+ def initialize values
+ @distance_to_zero = 1.0
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @offset = vs.min - @distance_to_zero
+ @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+ @vs = @autoscaler.vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to restore.
+ # @return [GSL::Vector] restored values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ rv = @autoscaler.restore(vs)
+ rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def mvlog values
+ values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+ end
+
+ end
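+ # Minimal usage sketch (hypothetical values, assuming GSL's to_gv helper):
+ #   las = OpenTox::Transform::LogAutoScale.new([1.0, 10.0, 100.0].to_gv)
+ #   las.vs                # autoscaled log10 values (offset is min - 1 = 0)
+ #   las.restore(las.vs)   # ~ [1.0, 10.0, 100.0]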
+
+
+ # Auto-Scaler for GSL vectors.
+ # Center on mean and divide by standard deviation.
+ class AutoScale
+ attr_accessor :vs, :mean, :stdev
+
+ # @param [GSL::Vector] values to transform using AutoScaling.
+ def initialize values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @mean = vs.to_scale.mean
+ @stdev = vs.to_scale.standard_deviation_population
+ @stdev = 0.0 if @stdev.nan?
+ @vs = transform vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def transform values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ autoscale values.clone
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] Values to restore.
+ # @return [GSL::Vector] restored values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+ (rv_ss + @mean).to_gsl
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def autoscale values
+ vs_ss = values.clone.to_scale - @mean
+ @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+ end
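+ # Worked example (hypothetical values): for [1.0, 2.0, 3.0] the mean is 2.0
+ # and the population stdev ~0.816, so autoscale yields roughly
+ # [-1.22, 0.0, 1.22]; restore multiplies by stdev and re-adds the mean.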
+
+ end
+
+
+ # Principal Components Analysis.
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1], default 0.05.
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression.to_f
+ @mean = Array.new
+ @autoscaler = Array.new
+ @cols = Array.new
+ @maxcols = maxcols
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ @cols << i
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # PCA uses internal centering on 0
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+ (0..@cols.size-1).each { |i|
+ as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+ @mean << as.mean
+ @autoscaler << as
+ }
+
+ # PCA
+ data_matrix_hash = Hash.new
+ (0..@cols.size-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+
+ # Select best eigenvectors
+ pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..@cols.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Transforms data to feature space found by PCA.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform values
+ begin
+ vs = values.clone
+ raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+ data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+ @cols.each_with_index { |i,j|
+ data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+ }
+ (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..@cols.size-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
+
+
+ # Singular Value Decomposition
+ class SVD
+ attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix
+ # @param [Float] Compression ratio from [0,1], default 0.05
+ # @return [GSL::Matrix] Data transformed matrix
+
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression
+
+ # Compute the SV Decomposition X=USV
+ # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+ u, vt, s = data_matrix.SV_decomp
+
+ # Determine cutoff index
+ s2 = s.mul(s) ; s2_sum = s2.sum
+ s2_run = 0
+ k = s2.size - 1
+ s2.to_a.reverse.each { |v|
+ s2_run += v
+ frac = s2_run / s2_sum
+ break if frac > compression
+ k -= 1
+ }
+ k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
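+ # Worked example (hypothetical singular values): for s = [3, 2, 1] and
+ # compression 0.5: s2 = [9, 4, 1], sum 14. Walking the reversed values,
+ # 1/14 and 5/14 stay below 0.5 (k drops to 0), 14/14 exceeds it and breaks;
+ # the k == 0 guard then bumps k to 1, keeping a 2-dimensional approximation.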
+
+ # Take the k-rank approximation of the Matrix
+ # - Take first k columns of u
+ # - Take first k columns of vt
+ # - Take the first k eigenvalues
+ @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+ @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+ s = GSL::Matrix.diagonal(s)
+ @eigk = s.submatrix((0..k), (0..k))
+ @eigk_inv = @eigk.inv
+
+ # Transform data
+ @data_transformed_matrix = @uk # = u for all SVs
+ # NOTE: @data_transformed_matrix is also equal to
+ # @data_matrix * @vk * @eigk_inv
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Transforms data instance (1 row) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x m).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_instance values
+ begin
+ values * @vk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+ alias :transform :transform_instance # make this the default (see PCA interface)
+
+ # Transforms data feature (1 column) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x n).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_feature values
+ begin
+ values * @uk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ end
+
+
+
+ # Attaches transformations to an OpenTox::Model
+ # Stores props, sims, performs similarity calculations
+ class ModelTransformer
+ attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+ # @params[OpenTox::Model] model to transform
+ def initialize model
+ @model = model
+ @similarity_algorithm = @model.similarity_algorithm
+ end
+
+ def transform
+ get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+ @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+ # Preprocessing
+ if (@model.similarity_algorithm == "Similarity.cosine")
+ # truncate nil-columns and -rows
+ LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ while @q_prop.size>0
+ idx = @q_prop.index(nil)
+ break if idx.nil?
+ @q_prop.slice!(idx)
+ @n_prop.each { |r| r.slice!(idx) }
+ end
+ LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ remove_nils # removes nil cells (for cosine); alters @n_prop, @q_prop, cuts down @ids to survivors
+ LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+ # adjust rest
+ fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+ cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+ # scale and svd
+ nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+ gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+ gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+ (0...nr_features).each { |i|
+ autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+ gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+ gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+ }
+ svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+ @n_prop = svd.data_transformed_matrix.to_a
+ @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+ LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ else
+ convert_nils # converts nil cells (for tanimoto); leaves @n_prop, @q_prop, @ids untouched
+ end
+
+ # neighbor calculation
+ @ids = [] # surviving compounds become neighbors
+ @sims = [] # calculated by neighbor routine
+ neighbors
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+ # Sims between neighbors, if necessary
+ gram_matrix = []
+ if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+ @n_prop.each_index do |i|
+ gram_matrix[i] = [] unless gram_matrix[i]
+ @n_prop.each_index do |j|
+ if (j>i)
+ sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+ gram_matrix[i][j] = sim
+ gram_matrix[j] = [] unless gram_matrix[j]
+ gram_matrix[j][i] = gram_matrix[i][j]
+ end
+ end
+ gram_matrix[i][i] = 1.0
+ end
+ end
+
+ # reclaim original data (if svd was performed)
+ if svd
+ @n_prop = gsl_n_prop_orig.to_a
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+ @q_prop = gsl_q_prop_orig.row(0).to_a
+ end
+
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+ @sims = [ gram_matrix, @sims ]
+
+ end
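+ # Minimal usage sketch, mirroring the call in Model::Lazar#predict:
+ #   mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(model)
+ #   mtf.transform
+ #   mtf.props, mtf.acts, mtf.sims   # inputs for the prediction algorithm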
+
+
+
+
+ # Find neighbors and store them as object variable, access all compounds for that.
+ def neighbors
+ @model.neighbors = []
+ @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+ add_neighbor fp, idx
+ end
+ end
+
+
+ # Adds a neighbor to @model.neighbors if it passes the similarity threshold;
+ # records the surviving index in @ids and its similarity in @sims
+ def add_neighbor(training_props, idx)
+
+ sim = similarity(training_props)
+ if sim > @model.parameter("min_sim")
+ if @model.activities[@cmpds[idx]]
+ @model.activities[@cmpds[idx]].each do |act|
+ @model.neighbors << {
+ :compound => @cmpds[idx],
+ :similarity => sim,
+ :features => @fps[idx].keys,
+ :activity => act
+ }
+ @sims << sim
+ @ids << idx
+ end
+ end
+ end
+ end
+
+
+ # Removes nil entries from n_prop and q_prop.
+ # Matrix is a nested two-dimensional array.
+ # Iteratively removes the row or column with the highest fraction of nil entries, until all nil entries are removed.
+ # Tie break: columns take precedence.
+ # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+ # Enables the use of cosine similarity / SVD
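+ # Example: for @n_prop = [[1, nil, 2], [3, 4, 5]] and @q_prop = [6, 7, 8],
+ # column 1 has the highest nil fraction (0.5) and is removed from both,
+ # leaving @n_prop = [[1, 2], [3, 5]] and @q_prop = [6, 8].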
+ def remove_nils
+ return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+ col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols = col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ while ((m_cols > 0) || (m_rows > 0)) do
+ if m_cols >= m_rows
+ @n_prop.each { |row| row.slice!(idx_cols) }
+ @q_prop.slice!(idx_cols)
+ else
+ @n_prop.slice!(idx_rows)
+ @ids.slice!(idx_rows)
+ end
+ break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+ col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols = col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ end
+ end
+
+
+ # Replaces nils by zeroes in n_prop and q_prop
+ # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
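+ # Example: @n_prop = [[1, nil], [nil, 2]] becomes [[1, 0], [0, 2]]; @q_prop analogously.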
+ def convert_nils
+ @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+ @q_prop.collect! { |v| v.nil? ? 0 : v }
+ end
+
+
+ # Executes the model's similarity_algorithm on training_props and @q_prop
+ def similarity(training_props)
+ eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+ end
+
+
+ # Converts model fingerprints to a matrix (@n_prop), one row per fingerprint; nil values are allowed.
+ # The query compound's fingerprints are converted analogously (@q_prop).
+ def get_matrices
+
+ @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+ @model.fingerprints.each { |fp|
+ cmpd = fp[0]; fp = fp[1]
+ if @model.activities[cmpd] # row good
+ acts = @model.activities[cmpd]; @acts += acts
+ LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+ row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+ acts.size.times { # multiple additions for multiple activities
+ @n_prop << row.dup # shallow copy suffices for a flat row of values
+ @cmpds << cmpd
+ @fps << Marshal.load(Marshal.dump(fp))
+ }
+ else
+ LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+ end
+ }
+
+ @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+ end
+
+ def props
+ @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+ end
+
+ end
+
+ end
+end
diff --git a/lib/utils.rb b/lib/utils.rb
new file mode 100644
index 0000000..d9d7b4b
--- /dev/null
+++ b/lib/utils.rb
@@ -0,0 +1,372 @@
+require 'csv'
+
+
+module OpenTox
+
+ module Algorithm
+
+ include OpenTox
+
+ # Calculate physico-chemical descriptors.
+ # @param[Hash] Required keys: :dataset_uri, :pc_type
+ # @return[String] dataset uri
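+ # A usage sketch with a hypothetical dataset uri:
+ #   pc_descriptors(:dataset_uri => "http://host/dataset/1", :pc_type => "electronic,cpsa")
+ #   # => uri of a new dataset holding the calculated descriptors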
+
+ def self.pc_descriptors(params)
+
+ begin
+ ds = OpenTox::Dataset.find(params[:dataset_uri])
+ compounds = ds.compounds.collect
+ ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
+ #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
+ LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+ load_ds_csv(ambit_result_uri, smiles_to_inchi)
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+
+ end
+
+ # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+ # @param[Hash] Required keys: :compounds, :pc_type
+ # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
+ def self.get_pc_descriptors(params)
+
+ begin
+
+ ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+ ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+ descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
+ descs_uris = []
+ params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
+ types = params[:pc_type].split(",")
+ descs.each { |uri, cat_name|
+ if types.include? cat_name[:category]
+ descs_uris << uri
+ end
+ }
+ if descs_uris.size == 0
+ raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
+ end
+ #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
+
+ begin
+ # Create SMI
+ smiles_array = []; smiles_to_inchi = {}
+ params[:compounds].each do |n|
+ cmpd = OpenTox::Compound.new(n)
+ smiles_string = cmpd.to_smiles
+ smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
+ smiles_array << smiles_string
+ end
+ smi_file = Tempfile.open(['pc_ambit', '.csv'])
+ pc_descriptors = nil
+
+ # Create Ambit dataset
+ smi_file.puts( "SMILES\n" )
+ smi_file.puts( smiles_array.join("\n") )
+ smi_file.flush
+ ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ smi_file.close! if smi_file
+ end
+ ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
+
+ # Calculate 3D for CPSA
+ if types.include? "cpsa"
+ ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
+ LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+ end
+
+ # Get Ambit results
+ ambit_result_uri = [] # 1st pos: base uri, then features
+ ambit_result_uri << ambit_ds_uri + "?"
+ ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+ descs_uris.each_with_index do |uri, i|
+ algorithm = Algorithm::Generic.new(uri)
+ result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
+ ambit_result_uri << result_uri.split("?")[1] + "&"
+ LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+ end
+ #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
+ [ ambit_result_uri, smiles_to_inchi ]
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Load dataset via CSV
+ # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
+ # @return[String] dataset uri
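+ # Example: for ambit_result_uri = ["http://host/dataset/1?", "feature_uris[]=...&"],
+ # each feature chunk is appended to the base uri (element 0) and fetched as CSV.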
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+
+ master=nil
+ (1...ambit_result_uri.size).each { |idx|
+ curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+ LOGGER.debug "Requesting #{curr_uri}"
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+ if csv_data[0] && csv_data[0].size>1
+ if master.nil? # This is the smiles entry
+ (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+ master = csv_data
+ next
+ else
+ index_uri = csv_data[0].index("SMILES")
+ csv_data.map {|i| i.delete_at(index_uri)} if index_uri # Removes additional SMILES information
+
+ nr_cols = (csv_data[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ csv_data.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ end
+ end
+ }
+
+ index_uri = master[0].index("Compound")
+ master.map {|i| i.delete_at(index_uri)}
+ master[0].each {|cell| cell.chomp!(" ")}
+ master[0][0] = "Compound" #"SMILES"
+ index_smi = master[0].index("SMILES")
+ master.map {|i| i.delete_at(index_smi)} if index_smi
+ #master[0][0] = "SMILES"
+
+ #LOGGER.debug "-------- AM: Writing to dumpfile"
+ #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
+
+ parser = OpenTox::Parser::Spreadsheets.new
+ ds = OpenTox::Dataset.new(nil,subjectid)
+ ds.save(subjectid)
+ parser.dataset = ds
+ ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
+ ds.save(subjectid)
+ end
+
+
+ # Gauss kernel
+ # @param [Float] x similarity value (the distance d = 1 - x is used)
+ # @param [Float] sigma kernel width, defaults to 0.3
+ # @return [Float]
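+ # Example: gauss(0.7) # => ~0.61, since d = 0.3 and exp(-0.09/(2*0.09)) = exp(-0.5)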
+ def self.gauss(x, sigma = 0.3)
+ d = 1.0 - x.to_f
+ Math.exp(-(d*d)/(2*sigma*sigma))
+ end
+
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
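+ # Examples: [0, 0, 0] (non-occurring), [0, 1, 0] (singular) and [1, 2, 3]
+ # (present everywhere) all yield true; [0, 1, 2] yields false.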
+ def self.isnull_or_singular?(array)
+ nr_zeroes = array.count(0)
+ return (nr_zeroes == array.size) || # remove non-occurring feature
+ (nr_zeroes == array.size-1) || # remove singular feature
+ (nr_zeroes == 0) # also remove feature present everywhere
+ end
+
+
+ # Numeric value test
+ # @param[Object] value
+ # @return [Boolean] Whether value is a number
+ def self.numeric?(value)
+ true if Float(value) rescue false
+ end
+
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature has variance zero.
+ def self.zero_variance?(array)
+ return array.uniq.size == 1
+ end
+
+
+ # Sum of the sizes of an array of Enumerables.
+ # @param [Array] Array of Enumerables
+ # @return [Integer] Sum of the sizes of all elements
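+ # Example: sum_size([[1, 2], [3], [4, 5, 6]]) # => 6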
+ def self.sum_size(array)
+ sum=0
+ array.each { |e| sum += e.size }
+ return sum
+ end
+
+
+ # Minimum Frequency
+ # @param [OpenTox::Dataset] training dataset
+ # @param [Integer] per-mil value
+ # @return [Integer] min-frequency
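+ # Example: for 1000 training compounds and per_mil = 8, minfreq = 8.0 and
+ # Integer(8.0) => 8; for 100 compounds the raw value 0.8 is raised to the minimum of 2.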
+ def self.min_frequency(training_dataset,per_mil)
+ minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = 2 unless minfreq > 2
+ Integer(minfreq)
+ end
+
+
+ # Effect calculation for classification
+ # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # @param [Array] Array of database instance counts per class.
+ # @return [Integer] Index of the class with the strongest over-representation.
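+ # Example (hypothetical counts): effect([[1,2,3,4,5,6,7,8,9], [10]], [50, 50]) # => 0,
+ # since class 0 covers 90% of the occurrences but only 50% of the database.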
+ def self.effect(occurrences, db_instances)
+ max=0
+ max_value=0
+ nr_o = self.sum_size(occurrences)
+ nr_db = db_instances.to_scale.sum
+
+ occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
+ actual = o.size.to_f/nr_o
+ expected = db_instances[i].to_f/nr_db
+ if actual > expected
+ if ((actual - expected) / actual) > max_value
+ max_value = (actual - expected) / actual # 'Schleppzeiger'
+ max = i
+ end
+ end
+ }
+ max
+ end
+
+
+ # neighbors
+
+ module Neighbors
+
+ # Get confidence.
+ # @param[Hash] Required keys: :sims, :acts
+ # @return[Float] Confidence
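+ # Example (hypothetical values; only :sims enters the calculation):
+ #   get_confidence(:sims => [0.8, 0.6, 0.4], :acts => [1, 0, 1]) # => 0.6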
+ def self.get_confidence(params)
+ conf = params[:sims].inject{|sum,x| sum + x }
+ confidence = conf/params[:sims].size
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+ return confidence
+ end
+
+ end
+
+
+ # Similarity calculations
+ module Similarity
+
+ # Tanimoto similarity
+ # @param [Hash, Array] fingerprints of first compound
+ # @param [Hash, Array] fingerprints of second compound
+ # @return [Float] (Weighted) tanimoto similarity
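+ # Example with hypothetical hash fingerprints:
+ #   tanimoto({"C-C" => 2, "C=O" => 1}, {"C-C" => 1, "N-H" => 3})
+ #   # => 1/6.0, since the min-sum over common features is 1 and the max-sum over all features is 2+1+3 = 6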
+ def self.tanimoto(fingerprints_a,fingerprints_b,weights=nil,params=nil)
+
+ common_p_sum = 0.0
+ all_p_sum = 0.0
+
+ # fingerprints are hashes
+ if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+ common_features = fingerprints_a.keys & fingerprints_b.keys
+ all_features = (fingerprints_a.keys + fingerprints_b.keys).uniq
+ if common_features.size > 0
+ common_features.each{ |f| common_p_sum += [ fingerprints_a[f], fingerprints_b[f] ].min }
+ all_features.each{ |f| all_p_sum += [ fingerprints_a[f],fingerprints_b[f] ].compact.max } # compact, since one fp may be empty at that pos
+ end
+
+ # fingerprints are arrays
+ elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+ size = [ fingerprints_a.size, fingerprints_b.size ].min
+ LOGGER.warn "fingerprints don't have equal size" if fingerprints_a.size != fingerprints_b.size
+ (0...size).each { |idx|
+ common_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].min
+ all_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].max
+ }
+ end
+
+ (all_p_sum > 0.0) ? (common_p_sum/all_p_sum) : 0.0
+
+ end
+
+
+ # Cosine similarity
+ # @param [Hash, Array] fingerprints_a key-value properties of first compound
+ # @param [Hash, Array] fingerprints_b key-value properties of second compound
+ # @return [Float] cosine of the angle enclosed between vectors induced by keys present in both a and b
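+ # Example with hypothetical array fingerprints (uses the GSL-backed to_gv helper):
+ #   cosine([1, 0, 1], [1, 1, 0]) # => 0.5, i.e. dot product 1 / norm product 2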
+ def self.cosine(fingerprints_a,fingerprints_b,weights=nil)
+
+ # fingerprints are hashes
+ if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+ a = []; b = []
+ common_features = fingerprints_a.keys & fingerprints_b.keys
+ if common_features.size > 1
+ common_features.each do |p|
+ a << fingerprints_a[p]
+ b << fingerprints_b[p]
+ end
+ end
+
+ # fingerprints are arrays
+ elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+ a = fingerprints_a
+ b = fingerprints_b
+ end
+
+ (a.size > 0 && b.size > 0) ? self.cosine_num(a.to_gv, b.to_gv) : 0.0
+
+ end
+
+
+ # Cosine similarity
+ # @param [GSL::Vector] a
+ # @param [GSL::Vector] b
+ # @return [Float] cosine of angle enclosed between a and b
+ def self.cosine_num(a, b)
+ if a.size>12 && b.size>12 # truncate long vectors to their first 12 components
+ a = a[0..11]
+ b = b[0..11]
+ end
+ a.dot(b) / (a.norm * b.norm)
+ end
+
+
+ # Outlier detection based on Mahalanobis distances
+ # Multivariate detection on X, univariate detection on y
+ # Uses an existing Rinruby instance, if possible
+ # @param[Hash] Keys query_matrix, data_matrix, acts are required; r, p_outlier optional
+ # @return[Array] indices identifying outliers (may occur several times, this is intended)
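+ # A usage sketch with hypothetical inputs (requires R with the robustbase package):
+ #   outliers(:data_matrix => gsl_n_prop, :query_matrix => gsl_q_prop, :acts => @acts)
+ #   # => e.g. [3, 7, 7]; an index may appear once per test that flags it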
+ def self.outliers(params)
+ outlier_array = []
+ data_matrix = params[:data_matrix]
+ query_matrix = params[:query_matrix]
+ acts = params[:acts]
+ begin
+ LOGGER.debug "Outliers (p=#{params[:p_outlier] || 0.9999})..."
+ r = ( params[:r] || RinRuby.new(false,false) )
+ r.eval "suppressPackageStartupMessages(library(\"robustbase\"))"
+ r.eval "outlier_threshold = #{params[:p_outlier] || 0.999}"
+ nr_cases, nr_features = data_matrix.to_a.size, data_matrix.to_a[0].size
+ r.odx = data_matrix.to_a.flatten
+ r.q = query_matrix.to_a.flatten
+ r.y = acts.to_a.flatten
+ r.eval "odx = matrix(odx, #{nr_cases}, #{nr_features}, byrow=T)"
+ r.eval 'odx = rbind(q,odx)' # query is nr 0 (1) in ruby (R)
+ r.eval 'mah = covMcd(odx)$mah' # run MCD alg
+ r.eval "mah = pchisq(mah,#{nr_features})"
+ r.eval 'outlier_array = which(mah>outlier_threshold)' # multivariate outliers using robust mahalanobis
+ outlier_array = r.outlier_array.to_a.collect{|v| v-2 } # translate to ruby index (-1 for q, -1 due to ruby)
+ r.eval 'fqu = matrix(summary(y))[2]'
+ r.eval 'tqu = matrix(summary(y))[5]'
+ r.eval 'outlier_array = which(y>(tqu+1.5*IQR(y)))' # univariate outliers due to Tukey (http://goo.gl/mwzNH)
+ outlier_array += r.outlier_array.to_a.collect{|v| v-1 } # translate to ruby index (-1 due to ruby)
+ r.eval 'outlier_array = which(y<(fqu-1.5*IQR(y)))'
+ outlier_array += r.outlier_array.to_a.collect{|v| v-1 }
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ outlier_array
+ end
+
+ end
+
+
+ end
+
+end
+
diff --git a/lib/validation.rb b/lib/validation.rb
index 646b076..85004c7 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,3 +1,4 @@
+require "yaml"
module OpenTox
class Validation
include OpenTox
@@ -66,7 +67,7 @@ module OpenTox
# @return [String] report uri
def find_or_create_report( subjectid=nil, waiting_task=nil )
@report = ValidationReport.find_for_validation(@uri, subjectid) unless @report
- @report = ValidationReport.create(@uri, subjectid, waiting_task) unless @report
+ @report = ValidationReport.create(@uri, {}, subjectid, waiting_task) unless @report
@report.uri
end
@@ -107,6 +108,31 @@ module OpenTox
end
table
end
+
+ # returns probability-distribution for a given prediction
+ # it takes all predictions into account that have a confidence value >= the given confidence and the same predicted value
+ # (at least the 12 predictions with the highest confidence are selected, even if their confidence is below the given param)
+ #
+ # @param [Float] confidence value (between 0 and 1)
+ # @param [String] predicted value
+ # @param [String,optional] subjectid
+ # @return [Hash] see example
+ #
+ # Example 1:
+ # validation.probabilities(0.3,"active")
+ # -> {:min_confidence=>0.32, :num_predictions=>20, :probs=>{"active"=>0.7, "moderate"=>0.25, "inactive"=>0.05}}
+ # there have been 20 "active" predictions with confidence >= 0.3, 70 percent of them being correct
+ #
+ # Example 2:
+ # validation.probabilities(0.8,"active")
+ # -> {:min_confidence=>0.45, :num_predictions=>12, :probs=>{"active"=>0.9, "moderate"=>0.1, "inactive"=>0}}
+ # the given confidence value was too high (i.e. <12 predictions with confidence value >= 0.8)
+ # the top 12 "active" predictions have a min_confidence of 0.45, 90 percent of them being correct
+ #
+ def probabilities( confidence, prediction, subjectid=nil )
+ YAML.load(OpenTox::RestClientWrapper.get(@uri+"/probabilities?prediction="+prediction.to_s+"&confidence="+confidence.to_s,
+ {:subjectid => subjectid, :accept => "application/x-yaml"}))
+ end
end
class Crossvalidation
@@ -168,6 +194,13 @@ module OpenTox
def statistics( subjectid=nil )
Validation.from_cv_statistics( @uri, subjectid )
end
+
+ # for documentation see OpenTox::Validation.probabilities
+ def probabilities( confidence, prediction, subjectid=nil )
+ YAML.load(OpenTox::RestClientWrapper.get(@uri+"/statistics/probabilities?prediction="+prediction.to_s+"&confidence="+confidence.to_s,
+ {:subjectid => subjectid, :accept => "application/x-yaml"}))
+ end
+
end
class ValidationReport
@@ -196,12 +229,18 @@ module OpenTox
# creates a validation report via validation
# @param [String] validation uri
+ # @param [Hash] params additional possible parameters
+ # (min_confidence, min_num_predictions, max_num_predictions)
# @param [String,optional] subjectid
# @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [OpenTox::ValidationReport]
- def self.create( validation_uri, subjectid=nil, waiting_task=nil )
+ def self.create( validation_uri, params={}, subjectid=nil, waiting_task=nil )
+ params = {} if params==nil
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
+ params[:validation_uris] = validation_uri
+ params[:subjectid] = subjectid
uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/validation"),
- { :validation_uris => validation_uri, :subjectid => subjectid }, {}, waiting_task )
+ params, {}, waiting_task )
ValidationReport.new(uri)
end
@@ -268,15 +307,17 @@ module OpenTox
uris.size==0 ? nil : AlgorithmComparisonReport.new(uris[-1])
end
- # creates a crossvalidation report via crossvalidation
+ # creates an algorithm comparison report via crossvalidation uris
# @param [Hash] crossvalidation uri_hash, see example
+ # @param [Hash] params additional possible parameters
+ # (ttest_significance, ttest_attributes, min_confidence, min_num_predictions, max_num_predictions)
# @param [String,optional] subjectid
# @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [OpenTox::AlgorithmComparisonReport]
# example for hash:
# { :lazar-bbrc => [ http://host/validation/crossvalidation/x1, http://host/validation/crossvalidation/x2 ],
# :lazar-last => [ http://host/validation/crossvalidation/xy, http://host/validation/crossvalidation/xy ] }
- def self.create( crossvalidation_uri_hash, subjectid=nil, waiting_task=nil )
+ def self.create( crossvalidation_uri_hash, params={}, subjectid=nil, waiting_task=nil )
identifier = []
validation_uris = []
crossvalidation_uri_hash.each do |id, uris|
@@ -285,8 +326,13 @@ module OpenTox
validation_uris << uri
end
end
+ params = {} if params==nil
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
+ params[:validation_uris] = validation_uris.join(",")
+ params[:identifier] = identifier.join(",")
+ params[:subjectid] = subjectid
uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/algorithm_comparison"),
- { :validation_uris => validation_uris.join(","), :identifier => identifier.join(","), :subjectid => subjectid }, {}, waiting_task )
+ params, {}, waiting_task )
AlgorithmComparisonReport.new(uri)
end
end
diff --git a/opentox-ruby.gemspec b/opentox-ruby.gemspec
index 2ec5a18..900d53f 100644
--- a/opentox-ruby.gemspec
+++ b/opentox-ruby.gemspec
@@ -5,7 +5,7 @@
Gem::Specification.new do |s|
s.name = %q{opentox-ruby}
- s.version = "3.0.0"
+ s.version = "3.1.0"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
@@ -45,6 +45,8 @@ Gem::Specification.new do |s|
"lib/templates/default_guest_policy.xml",
"lib/templates/default_policy.xml",
"lib/to-html.rb",
+ "lib/transform.rb",
+ "lib/utils.rb"
"lib/validation.rb"
]
s.homepage = %q{http://github.com/opentox/opentox-ruby}