diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/bbrc.rb | 165 | ||||
-rw-r--r-- | lib/classification.rb | 80 | ||||
-rw-r--r-- | lib/compound.rb | 148 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 117 | ||||
-rw-r--r-- | lib/dataset.rb | 131 | ||||
-rw-r--r-- | lib/descriptor.rb | 248 | ||||
-rw-r--r-- | lib/feature.rb | 17 | ||||
-rw-r--r-- | lib/lazar.rb | 45 | ||||
-rw-r--r-- | lib/leave-one-out-validation.rb | 205 | ||||
-rw-r--r-- | lib/model.rb | 104 | ||||
-rw-r--r-- | lib/overwrite.rb | 8 | ||||
-rw-r--r-- | lib/physchem.rb | 133 | ||||
-rw-r--r-- | lib/regression.rb | 337 | ||||
-rw-r--r-- | lib/rest-client-wrapper.rb | 9 | ||||
-rw-r--r-- | lib/similarity.rb | 58 | ||||
-rw-r--r-- | lib/unique_descriptors.rb | 11 | ||||
-rw-r--r-- | lib/validation.rb | 48 |
17 files changed, 762 insertions, 1102 deletions
diff --git a/lib/bbrc.rb b/lib/bbrc.rb deleted file mode 100644 index c83b9b3..0000000 --- a/lib/bbrc.rb +++ /dev/null @@ -1,165 +0,0 @@ -module OpenTox - module Algorithm - class Fminer - TABLE_OF_ELEMENTS = [ -"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] - - # - # Run bbrc algorithm on dataset - # - # @param [OpenTox::Dataset] training dataset - # @param [optional] parameters BBRC parameters, accepted parameters are - # - min_frequency Minimum frequency (default 5) - # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") - # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. 
(default "true") - # - min_chisq_significance Significance threshold (between 0 and 1) - # - nr_hits Set to "true" to get hit count instead of presence - # - get_target Set to "true" to obtain target variable as feature - # @return [OpenTox::Dataset] Fminer Dataset - def self.bbrc training_dataset, params={} - - time = Time.now - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 - - prediction_feature = training_dataset.features.first - if params[:min_frequency] - minfreq = params[:min_frequency] - else - per_mil = 5 # value from latest version - per_mil = 8 # as suggested below - i = training_dataset.feature_ids.index prediction_feature.id - nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size - minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST - minfreq = 2 unless minfreq > 2 - minfreq = minfreq.round - end - - @bbrc ||= Bbrc::Bbrc.new - @bbrc.Reset - if prediction_feature.numeric - @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! - else - bad_request_error "No accept values for "\ - "dataset '#{training_dataset.id}' and "\ - "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values - value2act = Hash[[*prediction_feature.accept_values.map.with_index]] - end - @bbrc.SetMinfreq(minfreq) - @bbrc.SetType(1) if params[:feature_type] == "paths" - @bbrc.SetBackbone(false) if params[:backbone] == "false" - @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] - @bbrc.SetConsoleOut(false) - - params[:nr_hits] ? 
nr_hits = params[:nr_hits] : nr_hits = false - feature_dataset = FminerDataset.new( - :training_dataset_id => training_dataset.id, - :training_algorithm => "#{self.to_s}.bbrc", - :training_feature_id => prediction_feature.id , - :training_parameters => { - :min_frequency => minfreq, - :nr_hits => nr_hits, - :backbone => (params[:backbone] == false ? false : true) - } - - ) - feature_dataset.compounds = training_dataset.compounds - - # add data - training_dataset.compounds.each_with_index do |compound,i| - act = value2act[training_dataset.data_entries[i].first] - if act # TODO check if this works - @bbrc.AddCompound(compound.smiles,i+1) - @bbrc.AddActivity(act,i+1) - end - end - #g_median=@fminer.all_activities.values.to_scale.median - - #task.progress 10 - #step_width = 80 / @bbrc.GetNoRootNodes().to_f - - $logger.debug "BBRC setup: #{Time.now-time}" - time = Time.now - ftime = 0 - itime = 0 - rtime = 0 - - # run @bbrc - (0 .. @bbrc.GetNoRootNodes()-1).each do |j| - results = @bbrc.MineRoot(j) - results.each do |result| - rt = Time.now - f = YAML.load(result)[0] - smarts = f.shift - # convert fminer SMARTS representation into a more human readable format - smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do - element = TABLE_OF_ELEMENTS[$1.to_i-1] - $2 == "a" ? element.downcase : element - end - p_value = f.shift - f.flatten! 
- compound_idxs = f.collect{|e| e.first.first-1} - # majority class - effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode - -=begin - if (!@bbrc.GetRegression) - id_arrs = f[2..-1].flatten - max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc - effect = max+1 - else #regression part - id_arrs = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - id=id.keys[0] # extract id from hit count hash - f_arr.push(@fminer.all_activities[id]) - end - f_median=f_arr.to_scale.median - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end -=end - rtime += Time.now - rt - - ft = Time.now - feature = OpenTox::FminerSmarts.find_or_create_by({ - "smarts" => smarts, - "p_value" => p_value.to_f.abs.round(5), - "effect" => effect, - "dataset_id" => feature_dataset.id - }) - feature_dataset.feature_ids << feature.id - ftime += Time.now - ft - - it = Time.now - f.each do |id_count_hash| - id_count_hash.each do |id,count| - nr_hits ? count = count.to_i : count = 1 - feature_dataset.data_entries[id-1] ||= [] - feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count - end - end - itime += Time.now - it - - end - end - - $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" - time = Time.now - - feature_dataset.fill_nil_with 0 - - $logger.debug "Prepare save: #{Time.now-time}" - time = Time.now - feature_dataset.save_all - - $logger.debug "Save: #{Time.now-time}" - feature_dataset - - end - end - end -end diff --git a/lib/classification.rb b/lib/classification.rb index b4b2e59..0202940 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -5,14 +5,12 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] - return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? 
weighted_sum = {} sim_sum = 0.0 confidence = 0.0 neighbors.each do |row| - n,sim,acts = row - #confidence = sim if sim > confidence # distance to nearest neighbor - acts.each do |act| + sim = row["tanimoto"] + row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim end @@ -30,81 +28,7 @@ module OpenTox bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" end end - - # Classification with majority vote from neighbors weighted by similarity - # @param [Hash] params Keys `:activities, :sims, :value_map` are required - # @return [Numeric] A prediction value. - def self.fminer_weighted_majority_vote neighbors, training_dataset - - neighbor_contribution = 0.0 - confidence_sum = 0.0 - - $logger.debug "Weighted Majority Vote Classification." - - values = neighbors.collect{|n| n[2]}.uniq - neighbors.each do |neighbor| - i = training_dataset.compound_ids.index n.id - neighbor_weight = neighbor[1] - activity = values.index(neighbor[2]) + 1 # map values to integers > 1 - neighbor_contribution += activity * neighbor_weight - if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true - case activity - when 1 - confidence_sum -= neighbor_weight - when 2 - confidence_sum += neighbor_weight - end - else - confidence_sum += neighbor_weight - end - end - if values.size == 2 - if confidence_sum >= 0.0 - prediction = values[1] - elsif confidence_sum < 0.0 - prediction = values[0] - end - elsif values.size == 1 # all neighbors have the same value - prediction = values[0] - else - prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction - end - - confidence = (confidence_sum/neighbors.size).abs - {:value => prediction, :confidence => confidence.abs} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, 
:min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_classification(params) - - confidence = 0.0 - prediction = nil - - $logger.debug "Local SVM." - if params[:activities].size>0 - if params[:props] - n_prop = params[:props][0].collect.to_a - q_prop = params[:props][1].collect.to_a - props = [ n_prop, q_prop ] - end - activities = params[:activities].collect.to_a - activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification - prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = prediction.sub(/Val/,"") if prediction # Convert back - confidence = 0.0 if prediction.nil? - #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." - confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) - end - {:value => prediction, :confidence => confidence} - - end - - - end - end end diff --git a/lib/compound.rb b/lib/compound.rb index 4d32e24..e002305 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,12 +1,9 @@ -# TODO: check -# *** Open Babel Error in ParseFile -# Could not find contribution data file. 
- CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox class Compound + require_relative "unique_descriptors.rb" include OpenTox DEFAULT_FINGERPRINT = "MP2D" @@ -15,34 +12,35 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array - field :warning, type: String field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId - field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :physchem_descriptors, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] + field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save compound end - def fingerprint type="MP2D" + def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format if type == "MP2D" fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title - fingerprints[type] = fp + fingerprints[type] = fp.uniq # no fingerprint counts #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html elsif type== "MNA" level = 2 # TODO: level as parameter, evaluate level 1, see paper @@ -82,19 +80,60 @@ module OpenTox fingerprints[type] end + def physchem descriptors=PhysChem.openbabel_descriptors + # TODO: speedup java descriptors + calculated_ids = physchem_descriptors.keys + # BSON::ObjectId instances are not allowed as keys in a BSON document. 
+ new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids + descs = {} + algos = {} + new_ids.each do |id| + descriptor = PhysChem.find id + descs[[descriptor.library, descriptor.descriptor]] = descriptor + algos[descriptor.name] = descriptor + end + # avoid recalculating Cdk features with multiple values + descs.keys.uniq.each do |k| + descs[k].send(k[0].downcase,k[1],self).each do |n,v| + physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + end + end + save + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + end + + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,self.smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + value + end + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") # @param [String] smiles Smiles string # @return [OpenTox::Compound] Compound def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." + return nil + end smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) if smiles.empty? + $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." 
return nil - #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") else - #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) Compound.find_or_create_by :smiles => smiles end end @@ -109,7 +148,7 @@ module OpenTox #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? - Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) else Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end @@ -245,34 +284,19 @@ module OpenTox def fingerprint_neighbors params bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." 
unless params[:type] and params[:training_dataset_id] and params[:min_sim] neighbors = [] - #if params[:type] == DEFAULT_FINGERPRINT - #neighbors = db_neighbors params - #p neighbors - #else + if params[:type] == DEFAULT_FINGERPRINT + neighbors = db_neighbors params + else query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << [compound.id, sim] if sim >= params[:min_sim] - end - end - #end - neighbors.sort{|a,b| b.last <=> a.last} - end - - def fminer_neighbors params - bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + training_dataset = Dataset.find(params[:training_dataset_id]) + prediction_feature = training_dataset.features.first + training_dataset.compounds.each do |compound| + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= 
params[:min_sim] end + neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end neighbors end @@ -285,13 +309,7 @@ module OpenTox # TODO implement pearson and cosine similarity separatly R.assign "x", query_fingerprint R.assign "y", candidate_fingerprint - # pearson r - #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby - #p "pearson" - #p sim - #p "cosine" sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - #p sim if sim >= params[:min_sim] neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming end @@ -300,53 +318,53 @@ module OpenTox end def db_neighbors params - p "DB NEIGHBORS" - p params - # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - qn = fingerprint(params[:type]).size + + #qn = default_fingerprint_size #qmin = qn * threshold #qmax = qn / threshold #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) 
#reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] aggregate = [ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { 'tanimoto' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, + #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, - '_id' => 1 + '_id' => 1, + 'features' => 1, + 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} end - # Get mg from mmol - # @return [Float] value in mg - def mmol_to_mg mmol - mmol.to_f*molecular_weight + # Convert mmol to mg + # @return [Float] value in mg + def mmol_to_mg mmol + mmol.to_f*molecular_weight end - def mg_to_mmol mg - mg.to_f/molecular_weight + # Convert mg to mmol + # @return [Float] value in mg + def mg_to_mmol mg + mg.to_f/molecular_weight end # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? 
- update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) - end - self["molecular_weight"] + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + physchem([mw_feature])[mw_feature.id.to_s] end - private def self.obconversion(identifier,input_format,output_format,option=nil) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 2e6dabb..15dfb21 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -35,14 +35,14 @@ module OpenTox predictions = [] training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| - fork do # parallel execution of validations + #fork do # parallel execution of validations $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - end + #end end - Process.waitall + #Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics @@ -79,23 +79,26 @@ module OpenTox true_rate = {} predictivity = {} predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction and confidence.numeric? 
- if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += confidence + compound_id,activities,prediction,confidence = pred + if activities and prediction #and confidence.numeric? + if activities.uniq.size == 1 + activity = activities.uniq.first + if prediction == activity + if prediction == accept_values[0] + confusion_matrix[0][0] += 1 + #weighted_confusion_matrix[0][0] += confidence + elsif prediction == accept_values[1] + confusion_matrix[1][1] += 1 + #weighted_confusion_matrix[1][1] += confidence + end + elsif prediction != activity + if prediction == accept_values[0] + confusion_matrix[0][1] += 1 + #weighted_confusion_matrix[0][1] += confidence + elsif prediction == accept_values[1] + confusion_matrix[1][0] += 1 + #weighted_confusion_matrix[1][0] += confidence + end end end else @@ -109,17 +112,17 @@ module OpenTox predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f end confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end + #weighted_confusion_matrix.each do |r| + #r.each do |c| + #confidence_sum += c + #end + #end update_attributes( accept_values: accept_values, confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, + #weighted_confusion_matrix: weighted_confusion_matrix, accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + 
#weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, true_rate: true_rate, predictivity: predictivity, finished_at: Time.now @@ -129,14 +132,14 @@ module OpenTox def confidence_plot unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + tmpfile = "/tmp/#{id.to_s}_confidence.png" accuracies = [] confidences = [] correct_predictions = 0 incorrect_predictions = 0 predictions.each do |p| if p[1] and p[2] - p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f confidences << p[3] @@ -146,7 +149,7 @@ module OpenTox R.assign "confidence", confidences R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") plot_id = $gridfs.insert_one(file) update(:confidence_plot_id => plot_id) end @@ -162,52 +165,46 @@ module OpenTox field :rmse, type: Float field :mae, type: Float - field :weighted_rmse, type: Float - field :weighted_mae, type: Float field :r_squared, type: Float field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId def statistics rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 mae = 0 - weighted_mae = 0 - rae = 0 - weighted_rae = 0 - confidence_sum = 0 + x = [] + y = [] predictions.each do |pred| compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence + if activity and 
prediction + unless activity == [nil] + x << -Math.log10(activity.median) + y << -Math.log10(prediction) + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + #weighted_rmse += confidence*error**2 + mae += error.abs + #weighted_mae += confidence*error.abs + #confidence_sum += confidence + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." end end - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + R.eval "r <- cor(measurement,prediction,use='complete')" r = R.eval("r").to_ruby mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum + #weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) update_attributes( mae: mae, rmse: rmse, - weighted_mae: weighted_mae, - weighted_rmse: weighted_rmse, + #weighted_mae: weighted_mae, + #weighted_rmse: weighted_rmse, r_squared: r**2, finished_at: Time.now ) @@ -243,11 +240,11 @@ module OpenTox :neighbors => neighbors } end - end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] + end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] end def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + tmpfile = "/tmp/#{id.to_s}_confidence.png" sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} @@ -255,7 +252,7 @@ module OpenTox R.eval "image = qplot(confidence,error)" 
R.eval "image = image + stat_smooth(method='lm', se=FALSE)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") plot_id = $gridfs.insert_one(file) update(:confidence_plot_id => plot_id) $gridfs.find_one(_id: confidence_plot_id).data @@ -263,7 +260,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.svg" + tmpfile = "/tmp/#{id.to_s}_correlation.png" x = predictions.collect{|p| p[1]} y = predictions.collect{|p| p[2]} attributes = Model::Lazar.find(self.model_id).attributes @@ -276,7 +273,7 @@ module OpenTox R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end diff --git a/lib/dataset.rb b/lib/dataset.rb index 4da8fd6..5d8aeaf 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,25 +5,12 @@ module OpenTox class Dataset - #attr_writer :data_entries - # associations like has_many, belongs_to deteriorate performance field :feature_ids, type: Array, default: [] field :compound_ids, type: Array, default: [] - #field :data_entries_id, type: BSON::ObjectId field :data_entries, type: Array, default: [] field :source, type: String - # Save all data including data_entries - # Should be used instead of save - def save_all - save - #dump = Marshal.dump(@data_entries) - #file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") - #entries_id = 
$gridfs.insert_one(file) - #update(:data_entries_id => entries_id) - end - # Readers # Get all compounds @@ -38,33 +25,6 @@ module OpenTox @features end -=begin - # Get all data_entries - def data_entries - unless @data_entries - t = Time.now - data_entry_file = $gridfs.find_one(_id: data_entries_id) - if data_entry_file.nil? - @data_entries = [] - else - @data_entries = Marshal.load(data_entry_file.data) - bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array - unless @data_entries.first.size == feature_ids.size - # TODO: fix (unknown) source of empty data_entries - sleep 1 - data_entry_file = $gridfs.find_one(_id: data_entries_id) - @data_entries = Marshal.load(data_entry_file.data) - end - bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size - # TODO: data_entries can be empty, poorly reproducible, mongo problem? 
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size - #$logger.debug "Retrieving data: #{Time.now-t}" - end - end - @data_entries - end -=end - # Find data entry values for a given compound and feature # @param compound [OpenTox::Compound] OpenTox Compound object # @param feature [OpenTox::Feature] OpenTox Feature object @@ -93,7 +53,13 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - len = self.compound_ids.size + unique_compound_data = {} + compound_ids.each_with_index do |cid,i| + unique_compound_data[cid] ||= [] + unique_compound_data[cid] << data_entries[i] + end + unique_compound_ids = unique_compound_data.keys + len = unique_compound_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -102,22 +68,44 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| self.compound_ids[i]} - test_data_entries = test_idxs.collect{|i| self.data_entries[i]} - test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries) + test_cids = test_idxs.collect{|i| unique_compound_ids[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| self.compound_ids[i]} - training_data_entries = training_idxs.collect{|i| self.data_entries[i]} - training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries) - test_dataset.save_all - training_dataset.save_all - chunks << [training_dataset,test_dataset] + training_cids = training_idxs.collect{|i| unique_compound_ids[i]} + chunk = [training_cids,test_cids].collect do |unique_cids| + cids = [] + data_entries = [] + unique_cids.each do |cid| + 
unique_compound_data[cid].each do |de| + cids << cid + data_entries << de + end + end + dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) + dataset.compounds.each do |compound| + compound.dataset_ids << dataset.id + compound.save + end + dataset.save + dataset + end start = last+1 + chunks << chunk end chunks end # Diagnostics + + def duplicates feature=self.features.first + col = feature_ids.index feature.id + dups = {} + compound_ids.each_with_index do |cid,i| + rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } + values = rows.collect{|row| data_entries[row][col]} + dups[cid] = values if values.size > 1 + end + dups + end def correlation_plot training_dataset # TODO: create/store svg @@ -145,7 +133,6 @@ module OpenTox end end - # Parsers # Create a dataset from file (csv,sdf,...) @@ -154,10 +141,10 @@ module OpenTox # TODO #def self.from_sdf_file #end - + # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true + def self.from_csv_file file, source=nil, bioassay=true#, layout={} source ||= file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) @@ -167,7 +154,7 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay + dataset.parse_table table, bioassay#, layout end dataset end @@ -224,12 +211,11 @@ module OpenTox value_time = 0 # compounds and values - #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now - identifier = vals.shift + identifier = vals.shift.strip warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? 
begin case compound_format @@ -246,7 +232,7 @@ module OpenTox warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - # TODO insert empty compounds to keep positions? + compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id compound_time += Time.now-ct r += 1 @@ -263,10 +249,15 @@ module OpenTox warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] - self.data_entries.last[j] = v.to_f + v = v.to_f else - self.data_entries.last[j] = v.strip + v = v.strip end + self.data_entries.last[j] = v + #i = compound.feature_ids.index feature_ids[j] + compound.features[feature_ids[j].to_s] ||= [] + compound.features[feature_ids[j].to_s] << v + compound.save end end compounds.duplicates.each do |compound| @@ -293,28 +284,6 @@ module OpenTox end end - def scale - scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} - centers = [] - scales = [] - feature_ids.each_with_index do |feature_id,col| - R.assign "x", data_entries.collect{|de| de[col]} - R.eval "scaled = scale(x,center=T,scale=T)" - centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby - scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby - R.eval("scaled").to_ruby.each_with_index do |value,row| - scaled_data_entries[row][col] = value - end - end - scaled_dataset = ScaledDataset.new(attributes) - scaled_dataset["_id"] = BSON::ObjectId.new - scaled_dataset["_type"] = "OpenTox::ScaledDataset" - scaled_dataset.centers = centers - scaled_dataset.scales = scales - scaled_dataset.data_entries = scaled_data_entries - scaled_dataset.save_all - scaled_dataset - end end # Dataset for lazar predictions diff --git a/lib/descriptor.rb b/lib/descriptor.rb deleted file mode 100644 index 9733bde..0000000 --- a/lib/descriptor.rb +++ /dev/null @@ -1,248 +0,0 @@ -require 'digest/md5' -ENV["JAVA_HOME"] ||= 
"/usr/lib/jvm/java-7-openjdk" -# TODO store descriptors in mongodb - -module OpenTox - - module Algorithm - - # Class for descriptor calculations - class Descriptor - include OpenTox - - JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - name,description = d.split(/\s+/,2) - ["Openbabel."+name,description] unless obexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten - - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] - # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'') - # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java - ["Joelib."+name, "no description available"] unless joelibexclude.include? 
name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) - DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys - - require_relative "unique_descriptors.rb" - - # Description of available descriptors - def self.description descriptor - lib = descriptor.split('.').first - case lib - when "Openbabel" - OBDESCRIPTORS[descriptor] - when "Cdk" - name = descriptor.split('.')[0..-2].join('.') - CDKDESCRIPTORS[name] - when "Joelib" - JOELIBDESCRIPTORS[descriptor] - when "lookup" - "Read feature values from a dataset" - end - end - - # Match an array of smarts features - def self.smarts_match compounds, smarts_features, count=false - bad_request_error "Compounds for smarts_match are empty" unless compounds - bad_request_error "Smarts features for smarts_match are empty" unless smarts_features - parse compounds - @count = count - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts_features = [smarts_features] if smarts_features.is_a?(Feature) - @smarts = smarts_features.collect{|f| f.smarts} - @physchem_descriptors = nil - @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} - @compounds.each_with_index do |compound,c| - obconversion.read_string(obmol,compound.smiles) - @smarts.each_with_index do |smart,s| - smarts_pattern.init(smart) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - @data_entries[c][s] = value - end - end - serialize - end - - # Count matches of an array with smarts features - def self.smarts_count compounds, smarts - # TODO: non-overlapping matches? 
- smarts_match compounds,smarts,true - end - - # Calculate physchem descriptors - # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - parse compounds - @data_entries = Array.new(@compounds.size){[]} - @descriptors = descriptors - @smarts = nil - @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - des = {} - @descriptors.each do |d| - lib, descriptor = d.split(".",2) - lib = lib.downcase.to_sym - des[lib] ||= [] - des[lib] << descriptor - end - des.each do |lib,descriptors| - p lib, descriptors - send(lib, descriptors) - end - serialize - end - - def self.openbabel descriptors - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" - obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'smi' - last_feature_idx = @physchem_descriptors.size - @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.smiles - obdescriptors.each_with_index do |descriptor,d| - @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) - end - end - @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} - end - - def self.java_descriptors descriptors, lib - $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" - sdf = sdf_3d - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - case lib - when "cdk" - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - when "joelib" - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - end - last_feature_idx = 
@physchem_descriptors.size - YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - # TODO create warnings - #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? - # CDK Descriptors may calculate multiple values, they are stored in separate features - @physchem_descriptors += calculation.keys if i == 0 - calculation.keys.each_with_index do |name,j| - @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) - end - end - FileUtils.rm "#{sdf}#{lib}.yaml" - end - - def self.cdk descriptors - java_descriptors descriptors, "cdk" - end - - def self.joelib descriptors - java_descriptors descriptors, "joelib" - end - - def self.lookup compounds, features, dataset - parse compounds - fingerprint = [] - compounds.each do |compound| - fingerprint << [] - features.each do |feature| - end - end - end - - def self.run_cmd cmd - cmd = "#{cmd} 2>&1" - $logger.debug "running external cmd: '#{cmd}'" - p = IO.popen(cmd) do |io| - while line = io.gets - $logger.debug "> #{line.chomp}" - end - io.close - raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 - end - end - - def self.sdf_3d - # TODO check if 3d sdfs are stored in GridFS - sdf = "" - @compounds.each do |compound| - sdf << compound.sdf - end - sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" - File.open(sdf_file,"w+"){|f| f.print sdf} - sdf_file - end - - def self.parse compounds - @input_class = compounds.class.to_s - case @input_class - when "OpenTox::Compound" - @compounds = [compounds] - when "Array" - @compounds = compounds - when "OpenTox::Dataset" - @compounds = compounds.compounds - else - bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." 
- end - end - - def self.serialize - @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} - case @input_class - when "OpenTox::Compound" - @data_entries.first - when "Array" - @data_entries - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save_all - dataset - end - end - - def self.fix_value val - val = val.first if val.is_a? Array and val.size == 1 - val = nil if val == "NaN" - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? 
- end - val - end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize - end - end -end diff --git a/lib/feature.rb b/lib/feature.rb index a308a55..b58946b 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -5,11 +5,11 @@ module OpenTox field :nominal, type: Boolean field :numeric, type: Boolean field :measured, type: Boolean + field :calculated, type: Boolean end # Feature for categorical variables class NominalFeature < Feature - # TODO check if accept_values are still needed field :accept_values, type: Array def initialize params super params @@ -34,21 +34,6 @@ module OpenTox end end - # Feature for supervised fragments from Fminer algorithm - class FminerSmarts < Smarts - field :p_value, type: Float - # TODO check if effect is used - field :effect, type: String - field :dataset_id - end - - # Feature for physico-chemical descriptors - class PhysChemDescriptor < NumericFeature - field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem" - field :parameters, type: Hash - field :creator, type: String - end - # Feature for categorical bioassay results class NominalBioAssay < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index cc66841..a28ba3a 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -8,6 +8,7 @@ require 'mongoid' require 'rserve' require "nokogiri" require "base64" +require 'openbabel' # Environment setup ENV["LAZAR_ENV"] ||= "production" @@ -24,7 +25,6 @@ Mongoid.load_configuration({ } }) Mongoid.raise_not_found_error = false # return nil if no document is found -#$mongo = Mongoid.default_client $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs @@ -41,26 +41,27 @@ when "development" end # R setup +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +# should work on POSIX including os x +# http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line +NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i R = 
Rserve::Connection.new -R.eval "library(ggplot2)" -R.eval "library(grid)" -R.eval "library(gridExtra)" - -# Require sub-Repositories -require_relative '../libfminer/libbbrc/bbrc' # include before openbabel -require_relative '../libfminer/liblast/last' # -require_relative '../last-utils/lu.rb' -require_relative '../openbabel/lib/openbabel' - -# Fminer environment variables -ENV['FMINER_SMARTS'] = 'true' -ENV['FMINER_NO_AROMATIC'] = 'true' -ENV['FMINER_PVALUES'] = 'true' -ENV['FMINER_SILENT'] = 'true' -ENV['FMINER_NR_HITS'] = 'true' +R.eval " +suppressPackageStartupMessages({ + library(iterators,lib=\"#{rlib}\") + library(foreach,lib=\"#{rlib}\") + library(ggplot2,lib=\"#{rlib}\") + library(grid,lib=\"#{rlib}\") + library(gridExtra,lib=\"#{rlib}\") + library(pls,lib=\"#{rlib}\") + library(caret,lib=\"#{rlib}\") + library(doMC,lib=\"#{rlib}\") + registerDoMC(#{NR_CORES}) +}) +" # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -68,18 +69,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat "error.rb", "opentox.rb", "feature.rb", + "physchem.rb", "compound.rb", "dataset.rb", - "descriptor.rb", "algorithm.rb", - "descriptor.rb", - "bbrc.rb", "model.rb", - "similarity.rb", "classification.rb", "regression.rb", "validation.rb", "crossvalidation.rb", + "leave-one-out-validation.rb", "experiment.rb", ].each{ |f| require_relative f } - +OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb new file mode 100644 index 0000000..2cd13db --- /dev/null +++ 
b/lib/leave-one-out-validation.rb @@ -0,0 +1,205 @@ +module OpenTox + + class LeaveOneOutValidation + + field :model_id, type: BSON::ObjectId + field :dataset_id, type: BSON::ObjectId + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Array + field :finished_at, type: Time + + def self.create model + model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation + loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id + compound_ids = model.training_dataset.compound_ids + predictions = model.predict model.training_dataset.compounds + predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]} + predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?} + loo.nr_instances = predictions.size + predictions.select!{|p| p[:value]} # remove unpredicted + loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]} + loo.nr_unpredicted = loo.nr_instances - loo.predictions.size + loo.statistics + loo.save + loo + end + + def model + Model::Lazar.find model_id + end + end + + class ClassificationLeaveOneOutValidation < LeaveOneOutValidation + + field :accept_values, type: Array + field :confusion_matrix, type: Array, default: [] + field :weighted_confusion_matrix, type: Array, default: [] + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash, default: {} + field :predictivity, type: Hash, default: {} + field :confidence_plot_id, type: BSON::ObjectId + + def statistics + accept_values = Feature.find(model.prediction_feature_id).accept_values + confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + predictions.each do |pred| + pred[:database_activities].each do |db_act| + if pred[:value] + if 
pred[:value] == db_act + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:confidence] + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:confidence] + end + else + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:confidence] + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:confidence] + end + end + end + end + end + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + update_attributes( + accept_values: accept_values, + confusion_matrix: confusion_matrix, + weighted_confusion_matrix: weighted_confusion_matrix, + accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, + weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + true_rate: true_rate, + predictivity: predictivity, + finished_at: Time.now + ) + $logger.debug "Accuracy #{accuracy}" + end + + def confidence_plot + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + p[:database_activities].each do |db_act| + if p[:value] + p[:value] == db_act ? 
correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[:confidence] + + end + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + end + $gridfs.find_one(_id: confidence_plot_id).data + end + end + + + class RegressionLeaveOneOutValidation < LeaveOneOutValidation + + + field :rmse, type: Float, default: 0.0 + field :mae, type: Float, default: 0 + #field :weighted_rmse, type: Float, default: 0 + #field :weighted_mae, type: Float, default: 0 + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId + field :confidence_plot_id, type: BSON::ObjectId + + def statistics + confidence_sum = 0 + predicted_values = [] + measured_values = [] + predictions.each do |pred| + pred[:database_activities].each do |activity| + if pred[:value] + predicted_values << pred[:value] + measured_values << activity + error = Math.log10(pred[:value])-Math.log10(activity) + self.rmse += error**2 + #self.weighted_rmse += pred[:confidence]*error**2 + self.mae += error.abs + #self.weighted_mae += pred[:confidence]*error.abs + #confidence_sum += pred[:confidence] + end + end + if pred[:database_activities].empty? + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
+ end + end + R.assign "measurement", measured_values + R.assign "prediction", predicted_values + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + self.mae = self.mae/predictions.size + #self.weighted_mae = self.weighted_mae/confidence_sum + self.rmse = Math.sqrt(self.rmse/predictions.size) + #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum) + self.r_squared = r**2 + self.finished_at = Time.now + save + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + end + + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.svg" + predicted_values = [] + measured_values = [] + predictions.each do |pred| + pred[:database_activities].each do |activity| + if pred[:value] + predicted_values << pred[:value] + measured_values << activity + end + end + end + attributes = Model::Lazar.find(self.model_id).attributes + attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} + attributes = attributes.values.collect{|v| v.is_a?(String) ? 
v.sub(/OpenTox::/,'') : v}.join("\n") + R.assign "measurement", measured_values + R.assign "prediction", predicted_values + R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end + end + +end diff --git a/lib/model.rb b/lib/model.rb index 227d4d3..8e657b8 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -34,7 +34,6 @@ module OpenTox def initialize training_dataset, params={} super params - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first @@ -48,13 +47,34 @@ module OpenTox self end - def predict object, use_database_values=true + def predict_compound compound + prediction_feature = Feature.find prediction_feature_id + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + # remove neighbors without prediction_feature + # check for database activities (neighbors may include query compound) + database_activities = nil + prediction = {} + if neighbors.collect{|n| n["_id"]}.include? compound.id + + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq + prediction[:database_activities] = database_activities + prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." 
+ neighbors.delete_if{|n| n["_id"] == compound.id} + end + neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + if neighbors.empty? + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + else + prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) + prediction[:neighbors] = neighbors + prediction[:neighbors] ||= [] + end + prediction + end - t = Time.now - at = Time.now + def predict object training_dataset = Dataset.find training_dataset_id - prediction_feature = Feature.find prediction_feature_id # parse data compounds = [] @@ -71,63 +91,33 @@ module OpenTox # make predictions predictions = [] - neighbors = [] - compounds.each_with_index do |compound,c| - t = Time.new - - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # add activities - # TODO: improve efficiency, takes 3 times longer than previous version - neighbors.collect! do |n| - rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} - acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact - acts.empty? ? nil : n << acts - end - neighbors.compact! # remove neighbors without training activities - - database_activities = training_dataset.values(compound,prediction_feature) - if use_database_values and database_activities and !database_activities.empty? 
- database_activities = database_activities.first if database_activities.size == 1 - predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} - next - end - predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) -=begin -# TODO scaled dataset for physchem - p neighbor_algorithm_parameters - p (neighbor_algorithm_parameters["feature_dataset_id"]) - d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) - p d - p d.class - if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset - p "SCALED" - end -=end - end + predictions = compounds.collect{|c| predict_compound c} # serialize result case object.class.to_s when "OpenTox::Compound" prediction = predictions.first - prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction when "Array" return predictions when "OpenTox::Dataset" # prepare prediction dataset + measurement_feature = Feature.find prediction_feature_id + + prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) - # TODO move into warnings field + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ 
prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} - prediction_dataset.save_all + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} + prediction_dataset.save return prediction_dataset end @@ -164,15 +154,12 @@ module OpenTox def self.create training_dataset, params={} model = self.new training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.1 - #:type => "FP4", - #:training_dataset_id => training_dataset.id, - #:min_sim => 0.7 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end @@ -181,31 +168,11 @@ module OpenTox end end - class LazarFminerClassification < LazarClassification - field :feature_calculation_parameters, type: Hash - - def self.create training_dataset, fminer_params={} - model = super(training_dataset) - model.update "_type" => self.to_s # adjust class - model = self.find model.id # adjust class - model.neighbor_algorithm = "fminer_neighbors" - model.neighbor_algorithm_parameters = { - :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", - :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, - :min_sim => 0.3 - } - model.feature_calculation_parameters = fminer_params - model.save - model - end - end - class Prediction include OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO cv -> repeated cv # TODO field 
Validations field :endpoint, type: String field :species, type: String @@ -249,7 +216,6 @@ module OpenTox training_dataset = Dataset.from_csv_file file model = nil if training_dataset.features.first.nominal? - #model = LazarFminerClassification.create training_dataset model = LazarClassification.create training_dataset elsif training_dataset.features.first.numeric? model = LazarRegression.create training_dataset diff --git a/lib/overwrite.rb b/lib/overwrite.rb index c92ad2b..cef5758 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -22,6 +22,14 @@ class Numeric end end +class Float + # round to n significant digits + # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby + def signif(n) + Float("%.#{n}g" % self) + end +end + module Enumerable # @return [Array] only the duplicates of an enumerable def duplicates diff --git a/lib/physchem.rb b/lib/physchem.rb new file mode 100644 index 0000000..f7b880f --- /dev/null +++ b/lib/physchem.rb @@ -0,0 +1,133 @@ +module OpenTox + + # Feature for physico-chemical descriptors + class PhysChem < NumericFeature + + field :library, type: String + field :descriptor, type: String + field :description, type: String + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? 
name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdkdescriptors = {} + CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDK_DESCRIPTIONS.each do |d| + prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') + d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } + end + CDKDESCRIPTORS = cdkdescriptors + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + + require_relative "unique_descriptors.rb" + + def self.descriptors desc=DESCRIPTORS + desc.collect do |name,description| + lib,desc = name.split('.',2) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + + def self.unique_descriptors + udesc = [] + UNIQUEDESCRIPTORS.each do |name| + lib,desc = name.split('.',2) + if lib == "Cdk" + CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| + dname = "#{name}.#{n}" + description = DESCRIPTORS[dname] + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + else + description = DESCRIPTORS[name] + udesc << self.find_or_create_by(:name 
=> name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + udesc + end + + def self.openbabel_descriptors + descriptors OBDESCRIPTORS + end + + def self.cdk_descriptors + descriptors CDKDESCRIPTORS + end + + def self.joelib_descriptors + descriptors JOELIBDESCRIPTORS + end + + def calculate compound + result = send library.downcase,descriptor,compound + result[self.name] + end + + def openbabel descriptor, compound + obdescriptor = OpenBabel::OBDescriptor.find_type descriptor + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'smi' + obconversion.read_string obmol, compound.smiles + {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))} + end + + def cdk descriptor, compound + java_descriptor "cdk", descriptor, compound + end + + def joelib descriptor, compound + java_descriptor "joelib", descriptor, compound + end + + private + + def java_descriptor lib, descriptor, compound + + sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_3d,"w+"){|f| f.print compound.sdf} + + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf_3d} #{descriptor}` + when "joelib" + `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf_3d} #{descriptor}` + end + result = YAML.load_file("#{sdf_3d}#{lib}.yaml").first + result.keys.each{|k| result[k] = result.delete(k)} + result + end + + def fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? 
+ end + val + end + + end + +end diff --git a/lib/regression.rb b/lib/regression.rb index 868c25f..5021fb3 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,262 +1,151 @@ -# TODO install R packages kernlab, caret, doMC, class, e1071 - - - # log transform activities (create new dataset) - # scale, normalize features, might not be necessary - # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is - # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression - # zero-order correlation and the semi-partial correlation - # seems to be necessary for svm - # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 - # http://stackoverflow.com/questions/15436367/svm-scaling-input-values - # use lasso or elastic net?? - # select relevant features - # remove features with a single value - # remove correlated features - # remove features not correlated with endpoint module OpenTox module Algorithm class Regression - def self.weighted_average compound, params + def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 - confidence = 0.0 neighbors = params[:neighbors] - activities = [] neighbors.each do |row| - n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - acts.each do |act| - weighted_sum += sim*Math.log10(act) - activities << act - sim_sum += sim + sim = row["tanimoto"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end end end - #R.assign "activities", activities - #R.eval "cv = cv(activities)" - #confidence /= activities.standard_deviation#/activities.mean - #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] - #confidence = 
sim_sum/neighbors.size.to_f - #confidence = neighbors.size.to_f - confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} + {:value => prediction} end - def self.local_linear_regression compound, neighbors - p neighbors.size - return nil unless neighbors.size > 0 - features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - p features - training_data = Array.new(neighbors.size){Array.new(features.size,0)} - neighbors.each_with_index do |n,i| - #p n.first - neighbor = Compound.find n.first - features.each_with_index do |f,j| - training_data[i][j] = 1 if neighbor.fp4.include? f + # TODO explicit neighbors, also for physchem + def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end end end - p training_data - - R.assign "activities", neighbors.collect{|n| n[2].median} - R.assign "features", training_data - R.eval "model <- lm(activities ~ features)" - R.eval "summary <- summary(model)" - p R.summary - compound_features = features.collect{|f| compound.fp4.include? f ? 
1 : 0} - R.assign "compound_features", compound_features - R.eval "prediction <- predict(model,compound_features)" - p R.prediction - - end - def self.weighted_average_with_relevant_fingerprints neighbors - weighted_sum = 0.0 - sim_sum = 0.0 - fingerprint_features = [] - neighbors.each do |row| - n,sim,acts = row - neighbor = Compound.find n - fingerprint_features += neighbor.fp4 - end - fingerprint_features.uniq! - p fingerprint_features -=begin - p n - acts.each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim + variables = [] + data_frame = [activities] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k end end -=end - confidence = sim_sum/neighbors.size.to_f - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} - confidence = 0.0 - prediction = nil + if variables.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - $logger.debug "Local SVM." - props = neighbors.collect{|row| row[3] } - neighbors.shift - activities = neighbors.collect{|n| n[2]} - prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = nil if (!prediction.nil? && prediction.infinite?) - $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." - if prediction - confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) else - confidence = nil if prediction.nil? 
+ compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, compound_features + if prediction.nil? or prediction[:value].nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] + prediction[:value] = 10**prediction[:value] + prediction[:rmse] = 10**prediction[:rmse] + prediction + end end - [prediction, confidence] - + end + def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" - # Local support vector prediction from neighbors. - # Uses propositionalized setting. - # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] activities, activities for neighbors. - # @param [Float] min_train_performance, parameter to control censoring - # @return [Numeric] A prediction value. - def self.local_svm_prop(props, activities, min_train_performance) - - $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." - n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. - q_prop = props[0] # is an Array. 
+ neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 - prediction = nil - if activities.uniq.size == 1 - prediction = activities[0] - else - t = Time.now - #$logger.debug gram_matrix.to_yaml - #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests - @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests - rs = [] - ["caret", "doMC", "class"].each do |lib| - #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" - rs << "suppressPackageStartupMessages(library('#{lib}'))" + activities = [] + weights = [] + physchem = {} + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] # TODO cosine ? 
+ neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v + end + end end - #@r.eval "registerDoMC()" # switch on parallel processing - rs << "registerDoMC()" # switch on parallel processing - #@r.eval "set.seed(1)" - rs << "set.seed(1)" - $logger.debug "Loading R packages: #{Time.now-t}" - t = Time.now - p n_prop - begin - - # set data - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop_x_size <- c(#{n_prop.size})" - rs << "n_prop_y_size <- c(#{n_prop[0].size})" - rs << "y <- c(#{activities.join(',')})" - rs << "q_prop <- c(#{q_prop.join(',')})" - rs << "y = matrix(y)" - rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" - rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" - - $logger.debug "Setting R data: #{Time.now-t}" - t = Time.now - # prepare data - rs << " - weights=NULL - if (!(class(y) == 'numeric')) { - y = factor(y) - weights=unlist(as.list(prop.table(table(y)))) - weights=(weights-1)^2 - } - " - - rs << " - rem = nearZeroVar(prop_matrix) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - rem = findCorrelation(cor(prop_matrix)) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - " + end - #p @r.eval("y").to_ruby - #p "weights" - #p @r.eval("weights").to_ruby - $logger.debug "Preparing R data: #{Time.now-t}" - t = Time.now - # model + support vectors - #train_success = @r.eval <<-EOR - rs << ' - model = train(prop_matrix,y, - method="svmRadial", - preProcess=c("center", "scale"), - class.weights=weights, - trControl=trainControl(method="LGOCV",number=10), - tuneLength=8 - ) - perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - ' - File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} - p rs.join("\n") - p `Rscript 
/tmp/r.r` -=begin - @r.void_eval <<-EOR - model = train(prop_matrix,y, - method="svmRadial", - #preProcess=c("center", "scale"), - #class.weights=weights, - #trControl=trainControl(method="LGOCV",number=10), - #tuneLength=8 - ) - perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - EOR -=end + # remove properties with a single value + physchem.each do |pid,v| + physchem.delete(pid) if v.uniq.size <= 1 + end - $logger.debug "Creating R SVM model: #{Time.now-t}" - t = Time.now - if train_success - # prediction - @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice - #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice - @r.eval "if (class(y)!='numeric') p = as.character(p)" - prediction = @r.p + if physchem.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - # censoring - prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) - prediction = nil if prediction =~ /NA/ - $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" - else - $logger.debug "Model creation failed." - prediction = nil - end - $logger.debug "R Prediction: #{Time.now-t}" - rescue Exception => e - $logger.debug "#{e.class}: #{e.message}" - $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ensure - #puts @r.inspect - #TODO: broken pipe - #@r.quit # free R + else + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + if prediction.nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." 
+ return prediction + else + prediction[:value] = 10**prediction[:value] + prediction end end - prediction + end - end + def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values + R.assign "weights", training_weights + r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", training_features + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" + rescue + return nil + end + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } + end + + end end end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index de1b74f..9321a75 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -26,15 +26,14 @@ module OpenTox define_singleton_method method do |uri,payload={},headers={},waiting_task=nil| # check input - bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers==nil or headers.is_a?(Hash) + bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid - bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri - #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) + bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? 
$aa || URI(uri).host == URI($aa[:uri]).host else - bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header] + bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] end end @@ -72,7 +71,7 @@ module OpenTox msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}" cause = nil end - Object.method(error[:method]).call msg, uri, cause # call error method + Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method else response end diff --git a/lib/similarity.rb b/lib/similarity.rb deleted file mode 100644 index 91e18db..0000000 --- a/lib/similarity.rb +++ /dev/null @@ -1,58 +0,0 @@ -=begin -* Name: similarity.rb -* Description: Similarity algorithms -* Author: Andreas Maunz <andreas@maunz.de -* Date: 10/2012 -=end - -module OpenTox - module Algorithm - - class Similarity - - #TODO weighted tanimoto - - # Tanimoto similarity - # @param [Array] a fingerprints of first compound - # @param [Array] b fingerprints of second compound - # @return [Float] Tanimoto similarity - def self.tanimoto(a,b) - bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size - #common = 0.0 - #a.each_with_index do |n,i| - #common += 1 if n == b[i] - #end - #common/a.size - # TODO check if calculation speed can be improved - common_p_sum = 0.0 - all_p_sum = 0.0 - (0...a.size).each { |idx| - common_p_sum += [ a[idx], b[idx] ].min - all_p_sum += [ a[idx], b[idx] ].max - } - common_p_sum/all_p_sum - end - - - # Cosine similarity - # @param [Array] a fingerprints of first compound - # @param [Array] b fingerprints of second compound - # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b - def self.cosine(a, b) - val = 0.0 - if a.size>0 and b.size>0 - if a.size>12 && b.size>12 - a = a[0..11] - b = b[0..11] - end - a_vec = 
a.to_gv - b_vec = b.to_gv - val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm) - end - val - end - - end - - end -end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index cf9cbf3..8341a67 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! "Openbabel.logP", #octanol/water partition coefficient "Openbabel.MP", #Melting point "Openbabel.MR", #molar refractivity @@ -24,7 +24,7 @@ UNIQUEDESCRIPTORS = [ "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). "Cdk.AcidicGroupCount", #Returns the number of acidic groups. - "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system + #"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. 
@@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.NumberOfP", #no description available "Joelib.count.NumberOfO", #no description available "Joelib.count.NumberOfN", #no description available - #"Joelib.count.AromaticBonds", #no description available + #"Joelib.count.AromaticBonds", #no description available "Joelib.count.NumberOfI", #no description available "Joelib.count.NumberOfF", #no description available "Joelib.count.NumberOfC", #no description available @@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.GeometricalShapeCoefficient", #no description available #"Joelib.MolecularWeight", #no description available "Joelib.FractionRotatableBonds", #no description available - #"Joelib.count.HBD2", #no description available + #"Joelib.count.HBD2", #no description available #"Joelib.count.HBD1", #no description available "Joelib.LogP", #no description available "Joelib.GraphShapeCoefficient", #no description available @@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.SOGroups", #no description available "Joelib.TopologicalDiameter", #no description available "Joelib.count.NumberOfHal", #no description available - -].sort +] diff --git a/lib/validation.rb b/lib/validation.rb index c52ffc0..b72d273 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -29,17 +29,21 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used + cids = test_set.compound_ids + + test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] nr_unpredicted = 0 activities = test_set.data_entries.collect{|de| de.first} prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] and de[1] and 
de[1].numeric? - activity = activities[i] + if de[0] #and de[1] + cid = prediction_dataset.compound_ids[i] + rows = cids.each_index.select{|r| cids[r] == cid } + activities = rows.collect{|r| test_set.data_entries[r][0]} prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] + predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] else nr_unpredicted += 1 end @@ -63,6 +67,42 @@ module OpenTox end class RegressionValidation < Validation + + def statistics + rmse = 0 + weighted_rmse = 0 + rse = 0 + weighted_rse = 0 + mae = 0 + weighted_mae = 0 + confidence_sum = 0 + predictions.each do |pred| + compound_id,activity,prediction,confidence = pred + if activity and prediction + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + confidence_sum += confidence + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + x = predictions.collect{|p| p[1].median} + y = predictions.collect{|p| p[2]} + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + weighted_mae = weighted_mae/confidence_sum + rmse = Math.sqrt(rmse/predictions.size) + weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } + end end end |