Diffstat (limited to 'lib')
-rw-r--r-- | lib/caret-classification.rb | 107
-rw-r--r-- | lib/caret.rb | 11
-rw-r--r-- | lib/classification.rb | 5
-rw-r--r-- | lib/compound.rb | 80
-rw-r--r-- | lib/crossvalidation.rb | 42
-rw-r--r-- | lib/dataset.rb | 537
-rw-r--r-- | lib/download.rb | 358
-rw-r--r-- | lib/enm-import.rb | 125
-rw-r--r-- | lib/error.rb | 66
-rw-r--r-- | lib/feature.rb | 113
-rw-r--r-- | lib/import.rb | 132
-rw-r--r-- | lib/lazar.rb | 17
-rw-r--r-- | lib/leave-one-out-validation.rb | 41
-rw-r--r-- | lib/model.rb | 123
-rw-r--r-- | lib/opentox.rb | 7
-rw-r--r-- | lib/overwrite.rb | 2
-rw-r--r-- | lib/physchem.rb | 8
-rw-r--r-- | lib/regression.rb | 2
-rw-r--r-- | lib/rest-client-wrapper.rb | 10
-rw-r--r-- | lib/substance.rb | 1
-rw-r--r-- | lib/train-test-validation.rb | 13
-rw-r--r-- | lib/validation-statistics.rb | 176
-rw-r--r-- | lib/validation.rb | 2 |
23 files changed, 1356 insertions, 622 deletions
diff --git a/lib/caret-classification.rb b/lib/caret-classification.rb new file mode 100644 index 0000000..fefe6b6 --- /dev/null +++ b/lib/caret-classification.rb @@ -0,0 +1,107 @@ +module OpenTox + module Algorithm + + # Ruby interface for the R caret package + # Caret model list: https://topepo.github.io/caret/modelList.html + class Caret + + # Create a local R caret model and make a prediction + # @param [Array<Float,Bool>] dependent_variables + # @param [Array<Array<Float,Bool>>] independent_variables + # @param [Array<Float>] weights + # @param [String] Caret method + # @param [Array<Float,Bool>] query_variables + # @return [Hash] + def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + remove = [] + # remove independent_variables with single values + independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1} + remove.sort.reverse.each do |i| + independent_variables.delete_at i + query_variables.delete_at i + end + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "No variables for classification model. Using weighted average of similar substances." + elsif dependent_variables.uniq.size == 1 + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "All neighbors have the same measured activity. Cannot create random forest model, using weighted average of similar substances." + elsif dependent_variables.size < 3 + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for classification model. Using weighted average of similar substances." + else + dependent_variables.collect!{|v| to_r(v)} + independent_variables.each_with_index do |c,i| + c.each_with_index do |v,j| + independent_variables[i][j] = to_r(v) + end + end +# query_variables.collect!{|v| to_r(v)} + begin + R.assign "weights", weights + #r_data_frame = "data.frame(#{([dependent_variables.collect{|v| to_r(v)}]+independent_variables).collect{|r| "c(#{r.collect{|v| to_r(v)}.join(',')})"}.join(', ')})" + r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + R.eval "data <- #{r_data_frame}" + R.assign "features", (0..independent_variables.size-1).to_a + R.eval "names(data) <- append(c('activities'),features)" # + p "train" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + p "done" + rescue => e + $logger.debug "R caret model creation error for: #{e.message}" + $logger.debug dependent_variables + $logger.debug independent_variables + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." 
+ return prediction + end + begin + R.eval "query <- data.frame(rbind(c(#{query_variables.collect{|v| to_r(v)}.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query, type=\"prob\")" + names = R.eval("names(prediction)").to_ruby + probs = R.eval("prediction").to_ruby + probabilities = {} + names.each_with_index { |n,i| probabilities[n] = probs[i] } + value = probabilities.sort_by{|n,p| -p }[0][0] + prediction = { + :value => value, + :probabilities => probabilities, + :warnings => [], + } + rescue => e + $logger.debug "R caret prediction error for: #{e.inspect}" + $logger.debug self.inspect + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" + return prediction + end + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." + end + end + prediction + + end + + # Call caret methods dynamically, e.g. Caret.pls + def self.method_missing(sym, *args, &block) + args.first[:method] = sym.to_s + self.create_model_and_predict args.first + end + + # Convert Ruby values to R values + def self.to_r v + return "F" if v == false + return "T" if v == true + return nil if v.is_a? Float and v.nan? + return "\"#{v}\"" if v.is_a? String + v + end + + end + end +end + diff --git a/lib/caret.rb b/lib/caret.rb index 8bccf74..2e5f1bc 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -22,11 +22,11 @@ module OpenTox end if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances." + prediction[:warnings] = ["No variables for regression model, using weighted average of similar substances (no prediction interval available)."] elsif dependent_variables.size < 3 prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." + prediction[:warnings] = ["Insufficient number of neighbors (#{dependent_variables.size}) for regression model, using weighted average of similar substances (no prediction interval available)."] else dependent_variables.each_with_index do |v,i| dependent_variables[i] = to_r(v) @@ -51,7 +51,8 @@ module OpenTox $logger.debug dependent_variables $logger.debug independent_variables prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." + prediction[:warnings] ||= [] + prediction[:warnings] << "R caret model creation error, using weighted average of similar substances (no prediction interval available)." 
return prediction end begin @@ -72,12 +73,12 @@ module OpenTox $logger.debug "R caret prediction error for:" $logger.debug self.inspect prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" + prediction[:warnings] << "R caret prediction error, using weighted average of similar substances (no prediction interval available)." return prediction end if prediction.nil? or prediction[:value].nil? prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." + prediction[:warnings] << "Empty R caret prediction, using weighted average of similar substances (no prediction interval available)." end end prediction diff --git a/lib/classification.rb b/lib/classification.rb index a875903..638492b 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -18,11 +18,6 @@ module OpenTox class_weights.each do |a,w| probabilities[a] = w.sum/weights.sum end - # DG: hack to ensure always two probability values - if probabilities.keys.uniq.size == 1 - missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] - probabilities[missing_key] = 0.0 - end probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) diff --git a/lib/compound.rb b/lib/compound.rb index bfe69e3..6d0e075 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,5 +1,3 @@ -CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" - module OpenTox # Small molecules with defined chemical structures @@ -12,7 +10,6 @@ module OpenTox field :inchikey, type: String field :names, type: Array field :cid, type: String - field :chemblid, type: String field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId @@ -35,13 +32,11 @@ module OpenTox def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles - #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - if type == "MP2D" + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title fingerprints[type] = fp.uniq # no fingerprint counts - #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - elsif type== "MNA" + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html level = 2 # TODO: level as parameter, evaluate level 1, see paper fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") fp.shift # remove Title @@ -130,30 +125,17 @@ module OpenTox # @param [String] smiles # @return [OpenTox::Compound] def self.from_smiles smiles - if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." - return nil - end + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - if smiles.empty? 
- $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." - return nil - else - Compound.find_or_create_by :smiles => smiles - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from InChI string # @param [String] InChI # @return [OpenTox::Compound] def self.from_inchi inchi - #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") - if smiles.empty? - Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) - else - Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from SDF @@ -170,7 +152,7 @@ module OpenTox # @param [String] name, can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] def self.from_name name - Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) + Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp end # Get InChI @@ -238,56 +220,16 @@ module OpenTox # names = compound.names # @return [Array<String>] def names - update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"] self["names"] end # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem # @return [String] def cid - pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" - update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end - - # Get ChEMBL database compound id, obtained via REST call to ChEMBL - # @return [String] - def chemblid - # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey - uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" - update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] - self["chemblid"] - end - - def db_neighbors min_sim: 0.1, dataset_id: - #p fingerprints[DEFAULT_FINGERPRINT] - # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - - #qn = default_fingerprint_size - #qmin = qn * threshold - #qmax = qn / threshold - #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) 
- #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] - aggregate = [ - #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self - {'$project' => { - 'similarity' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} - }}, - '_id' => 1, - #'measurements' => 1, - 'dataset_ids' => 1 - }}, - {'$match' => {'similarity' => {'$gte' => min_sim}}}, - {'$sort' => {'similarity' => -1}} - ] - - # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} - - end # Convert mmol to mg # @return [Float] value in mg @@ -319,7 +261,7 @@ module OpenTox obconversion.read_string obmol, identifier case output_format when /smi|can|inchi/ - obconversion.write_string(obmol).gsub(/\s/,'').chomp + obconversion.write_string(obmol).split(/\s/).first when /sdf/ # TODO: find disconnected structures # strip_salts @@ -332,11 +274,11 @@ module OpenTox print sdf if sdf.match(/.nan/) - $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates." + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 06a1e2a..e1761bc 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -15,7 +15,7 @@ module OpenTox $logger.debug model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - bad_request_error "Unknown model class #{model.class}." unless klass + raise ArgumentError, "Unknown model class #{model.class}." unless klass cv = klass.new( name: model.name, @@ -24,24 +24,16 @@ module OpenTox ) cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = TrainTest.create(model, fold[0], fold[1]) - cv.validation_ids << validation.id - cv.nr_instances += validation.nr_instances - cv.nr_unpredicted += validation.nr_unpredicted - #cv.predictions.merge! 
validation.predictions - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end - #Process.waitall cv.save - $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics cv.update_attributes(finished_at: Time.now) cv @@ -72,25 +64,25 @@ module OpenTox class ClassificationCrossValidation < CrossValidation include ClassificationStatistics field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float + field :confusion_matrix, type: Hash + field :accuracy, type: Hash field :true_rate, type: Hash field :predictivity, type: Hash + field :nr_predictions, type: Hash field :probability_plot_id, type: BSON::ObjectId end # Crossvalidation of regression models class RegressionCrossValidation < CrossValidation include RegressionStatistics - field :rmse, type: Float, default:0 - field :mae, type: Float, default:0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId + field :rmse, type: Hash + field :mae, type: Hash + field :r_squared, type: Hash + field :within_prediction_interval, type: Hash + field :out_of_prediction_interval, type: Hash + field :nr_predictions, type: Hash field :warnings, type: Array + field :correlation_plot_id, type: BSON::ObjectId end # Independent repeated crossvalidations @@ -103,7 +95,7 @@ module OpenTox # @param [Fixnum] number of folds # @param [Fixnum] number of repeats # @return [OpenTox::Validation::RepeatedCrossValidation] - def self.create model, folds=10, repeats=3 + def self.create model, folds=10, repeats=5 repeated_cross_validation = self.new repeats.times do |n| $logger.debug "Crossvalidation #{n+1} for #{model.name}" diff --git a/lib/dataset.rb b/lib/dataset.rb index 6e7d67f..fb1afd2 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,12 +1,16 @@ require 'csv' require 'tempfile' +require 'digest/md5' module OpenTox # Collection of substances and features class Dataset - field :data_entries, type: Hash, default: {} + field :data_entries, type: Array, default: [] #substance,feature,value + field :warnings, type: Array, default: [] + field :source, type: String + field :md5, type: String # Readers @@ -25,29 +29,87 @@ module OpenTox # Get all substances # @return [Array<OpenTox::Substance>] def substances - @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq + @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq @substances end # Get all features # @return [Array<OpenTox::Feature>] def features - @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq + @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq @features end # Get all values for a given substance and feature # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @return [TrueClass,FalseClass,Float] + # @return [Array<TrueClass,FalseClass,Float>] values def values substance,feature 
substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] - data_entries[substance.to_s][feature.to_s] - else - [nil] - end + substance = BSON::ObjectId.from_string(substance) if substance.is_a? String + feature = BSON::ObjectId.from_string(feature) if feature.is_a? String + data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} + end + + # Get OriginalId features + # @return [Array<OpenTox::OriginalId>] original ID features (merged datasets may have multiple original IDs) + def original_id_features + features.select{|f| f.is_a?(OriginalId)} + end + + # Get OriginalSmiles features + # @return [Array<OpenTox::OriginalSmiles>] original smiles features (merged datasets may have multiple original smiles) + def original_smiles_features + features.select{|f| f.is_a?(OriginalSmiles)} + end + + # Get Warnings features + # @return [Array<OpenTox::Warnings>] warnings features (merged datasets may have multiple warnings) + def warnings_features + features.select{|f| f.is_a?(Warnings)} + end + + # Get Confidence feature + # @return [OpenTox::Confidence] confidence feature + def confidence_feature + features.select{|f| f.is_a?(Confidence)}.first + end + + # Get nominal and numeric bioactivity features + # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>] + def bioactivity_features + features.select{|f| f._type.match(/BioActivity/)} + end + + # Get nominal and numeric bioactivity features + # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>] + def transformed_bioactivity_features + features.select{|f| f._type.match(/Transformed.*BioActivity/)} + end + + # Get nominal and numeric substance property features + # @return [Array<OpenTox::NominalSubstanceProperty,OpenTox::NumericSubstanceProperty>] + def substance_property_features + features.select{|f| f._type.match("SubstanceProperty")} + end + + # Get nominal and numeric prediction features + # @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>] + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} + end + + # Get nominal and numeric merged features + # @return [Array<OpenTox::MergedNominalBioActivity,OpenTox::MergedNumericBioActivity>] + def merged_features + features.select{|f| f._type.match("Merged")} end # Writers @@ -59,137 +121,178 @@ module OpenTox def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - data_entries[substance.to_s] ||= {} - data_entries[substance.to_s][feature.to_s] ||= [] - data_entries[substance.to_s][feature.to_s] << value - #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? 
# assuming that identical values come from the same source + data_entries << [substance,feature,value] if substance and feature and value end - # Dataset operations - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} - chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + # Parsers + + # Create a dataset from CSV file + # @param [File] Input file with the following format: + # - ID column (optional): header containing "ID" string, arbitrary ID values + # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings + # - one or more properties column(s): header with property name(s), property values + # files with a single property column are read as BioActivities (i.e. dependent variable) + # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) + # @return [OpenTox::Dataset] + def self.from_csv_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + table = nil + sep = "," + ["\t",";"].each do |s| # guess alternative CSV separator + if File.readlines(file).first.match(/#{s}/) + sep = s + break end - dataset.save - dataset end - start = last+1 - chunks << chunk + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + if table + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + dataset.parse_table table + else + raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + end end - chunks + dataset end - # Serialisation - - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names - # @return [String] - def to_csv(inchi=false) - CSV.generate() do |csv| - compound = substances.first.is_a? Compound - if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - else - csv << ["Name"] + features.collect{|f| f.name} - end - substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + # Create a dataset from SDF file + # files with a single data field are read as BioActivities (i.e. dependent variable) + # files with multiple data fields are read as SubstanceProperties (i.e. 
independent variable) + # @param [File] + # @return [OpenTox::Dataset] + def self.from_sdf_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + read_result = false + sdf = "" + feature_name = "" + compound = nil + features = {} + table = [["ID","SMILES"]] + + File.readlines(file).each do |line| + if line.match %r{\$\$\$\$} + sdf << line + id = sdf.split("\n").first.chomp + compound = Compound.from_sdf sdf + row = [id,compound.smiles] + features.each do |f,v| + table[0] << f unless table[0].include? f + row[table[0].index(f)] = v + end + table << row + sdf = "" + features = {} + elsif line.match /^>\s+</ + feature_name = line.match(/^>\s+<(.*)>/)[1] + read_result = true else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row + if read_result + value = line.chomp + features[feature_name] = value + read_result = false + else + sdf << line end end end + dataset.parse_table table end + dataset end - # Parsers - - # Create a dataset from file (csv,sdf,...) - # @param filename [String] - # @return [String] dataset uri - # TODO - #def self.from_sdf_file - #end - - # Create a dataset from CSV file - # @param [File] - # @param [TrueClass,FalseClass] accept or reject empty values + # Create a dataset from PubChem Assay + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] - def self.from_csv_file file, accept_empty_values=false - source = file - name = File.basename(file,".*") - dataset = self.find_by(:source => source, :name => name) - if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." - else - $logger.debug "Parsing #{file}." 
- table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - dataset = self.new(:source => source, :name => name) - dataset.parse_table table, accept_empty_values + def self.from_pubchem_aid aid + # TODO get regression data + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + name = assay_metadata["name"].gsub(/\s+/,"_") + dataset = self.new(:source => aid_url, :name => name) + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][3] + if value == "Active" or value == "Inactive" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s] + pubchem_cids << prop["CID"].to_s + else + dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}." + end + end + (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." } end + dataset.parse_table table dataset end # Parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types # @param [Array<Array>] - # @param [TrueClass,FalseClass] accept or reject empty values - def parse_table table, accept_empty_values + def parse_table table # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip - bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + + if feature_names[0] !~ /SMILES|InChI/i # check ID column + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) + else + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") + end + + compound_format = feature_names.shift + raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) + numeric = [] features = [] + # guess feature types + bioactivity = true if feature_names.size == 1 + feature_names.each_with_index do |f,i| - metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? 
nil : val }.uniq.compact + original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 + values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) + bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) else - metadata["accept_values"] = values numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) + bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) end features << feature if feature end @@ -198,35 +301,31 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| + original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip identifier = vals.shift.strip - warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i - substance = OpenTox::Compound.from_smiles(identifier) + substance = Compound.from_smiles(identifier) + add substance, original_smiles, identifier when /InChI/i - substance = OpenTox::Compound.from_inchi(identifier) + substance = Compound.from_inchi(identifier) end rescue substance = nil end + if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance - substance.dataset_ids << self.id - substance.dataset_ids.uniq! - substance.save - - unless vals.size == features.size - warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." - next - end + add substance, original_id, original_id_value vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." + warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." next elsif numeric[j] v = v.to_f @@ -235,46 +334,202 @@ module OpenTox end add substance, features[j], v end - data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end + + warnings_feature = Warnings.find_or_create_by(:dataset_id => id) all_substances.duplicates.each do |substance| positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} - warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} + all_substances.select{|s| s.smiles == substance.smiles}.each do |s| + add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
+ end end save end - # Delete dataset - def delete - compounds.each{|c| c.dataset_ids.delete id.to_s} - super + # Serialisation + + # Convert lazar prediction dataset to csv format + # @return [String] + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + export_features = merged_features + export_features = transformed_bioactivity_features if export_features.empty? + export_features = bioactivity_features if export_features.empty? + export_feature = export_features.first - end + header = ["Canonical SMILES"] + header << bioactivity_features.first.name # use original bioactivity name instead of long merged name + csv = [header] - # Dataset for lazar predictions - class LazarPrediction #< Dataset - field :creator, type: String - field :prediction_feature_id, type: BSON::ObjectId - field :predictions, type: Hash, default: {} + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,export_feature)[n] + csv << row + end + end + csv.collect{|r| r.join(",")}.join("\n") + end - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - Feature.find prediction_feature_id + # Convert dataset to SDF format + # @return [String] SDF string + def to_sdf + sdf = "" + compounds.each do |compound| + sdf_lines = compound.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = compound.smiles + sdf += sdf_lines.join("\n") + bioactivity_features.each do |f| + v = values(compound,f) + unless v.empty? + sdf += "\n> <#{f.name}>\n" + sdf += v.uniq.join "," + sdf += "\n" + end + end + sdf += "\n$$$$\n" + end + sdf end - # Get all compounds - # @return [Array<OpenTox::Compound>] - def compounds - substances.select{|s| s.is_a? Compound} + # Get lazar predictions from a dataset + # @return [Hash] predictions + def predictions + predictions = {} + substances.each do |s| + predictions[s] ||= {} + predictions[s][:value] = values(s,prediction_feature).first + #predictions[s][:warnings] = [] + #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + predictions[s][:confidence] = values(s,confidence_feature).first + if predictions[s][:value] and prediction_feature.is_a? 
NominalLazarPrediction + prediction_feature.accept_values.each do |v| + f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) + predictions[s][:probabilities] ||= {} + predictions[s][:probabilities][v] = values(s,f).first + end + end + end + predictions end - # Get all substances - # @return [Array<OpenTox::Substance>] - def substances - predictions.keys.collect{|id| Substance.find id} + # Dataset operations + + # Copy a dataset + # @return OpenTox::Dataset dataset copy + def copy + dataset = Dataset.new + dataset.data_entries = data_entries + dataset.warnings = warnings + dataset.name = name + dataset.source = id.to_s + dataset.save + dataset + end + + # Split a dataset into n folds + # @param [Integer] number of folds + # @return [Array] Array with folds [training_dataset,test_dataset] + def folds n + $logger.debug "Creating #{n} folds for #{name}." + len = self.substances.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_substances = test_idxs.collect{|i| substances[i].id} + training_idxs = indices-test_idxs + training_substances = training_idxs.collect{|i| substances[i].id} + chunk = [training_substances,test_substances].collect do |substances| + self.class.create( + :name => "#{self.name} (Fold #{i-1})", + :source => self.id, + :data_entries => data_entries.select{|row| substances.include? row[0]} + ) + end + start = last+1 + chunks << chunk + end + chunks + end + + # Merge an array of datasets + # @param [Array<OpenTox::Dataset>] datasets Datasets to be merged + # @param [Array<OpenTox::Feature>] features Features to be merged (same size as datasets) + # @param [Array<Hash>] value_maps Value transfomations (use nil for keeping original values, same size as dataset) + # @param [Bool] keep_original_features Copy original features/values to the merged dataset + # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment) + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates: + dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged") + + datasets.each do |d| + dataset.data_entries += d.data_entries + dataset.warnings += d.warnings + end if keep_original_features + + feature_classes = features.collect{|f| f.class}.uniq + merged_feature = nil + if feature_classes.size == 1 + if features.first.kind_of? NominalFeature + merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) + else + merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations + end + else + raise ArgumentError, "Cannot merge features of different types (#{feature_classes})." + end + + accept_values = [] + features.each_with_index do |f,i| + dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de| + value_maps[i] ? 
v = value_maps[i][de[2]] : v = de[2] + accept_values << v + [de[0],merged_feature.id,v] + end + end + + if merged_feature.is_a? MergedNominalBioActivity + merged_feature.accept_values = accept_values.uniq.sort + merged_feature.save + end + + dataset.data_entries.uniq! if remove_duplicates + dataset.save + dataset end end diff --git a/lib/download.rb b/lib/download.rb new file mode 100644 index 0000000..2546dc4 --- /dev/null +++ b/lib/download.rb @@ -0,0 +1,358 @@ +module OpenTox + + class Download + + DATA = File.join(File.dirname(__FILE__),"..","data") + + # Download classification dataset from PubChem into the data folder + # @param [Integer] aid PubChem Assay ID + # @param [String] active Name for the "Active" class + # @param [String] inactive Name for the "Inactive" class + # @param [String] species Species name + # @param [String] endpoint Endpoint name + # @param [Hash] qmrf Name and group for QMRF reports (optional) + def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][3] + if value == "Active" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],active] + pubchem_cids << prop["CID"].to_s + elsif value == "Inactive" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],inactive] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." 
} + end + File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + File.join(DATA,name+".csv") + end + + # Download regression dataset from PubChem into the data folder + # Uses -log10 transformed experimental data in mmol units + # @param [String] aid PubChem Assay ID + # @param [String] species Species name + # @param [String] endpoint Endpoint name + # @param [Hash] qmrf Name and group for QMRF reports (optional) + def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + unit = nil + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + # get unit + unit ||= CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0] == "RESULT_UNIT"}[0][8] + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES","-log10(#{name} [#{unit}])"]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][8] + if value + value = -Math.log10(value.to_f) + table << [slice[i][1].to_s,prop["CanonicalSMILES"],value] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." 
} + end + File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :unit => unit, + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + File.join(DATA,name+".csv") + end + + # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder + def self.mutagenicity + $logger.debug "Mutagenicity" + hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" + kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" + efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" + + parts = File.join(DATA, "parts") + FileUtils.mkdir_p parts + Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f } + File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s } + + # convert hansen + hansen = CSV.read File.join(parts,"hansen-original.csv") + hansen.shift + + map = {"0" => "non-mutagenic","1" => "mutagenic"} + File.open(File.join(parts,"hansen.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + hansen.each do |row| + f.puts [row[0],row[5],map[row[2]]].join "," + end + end + File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s } + `cd #{parts} && unzip cas_4337.zip` + `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls` + `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv` + + # convert EFSA data to mutagenicity classifications + i = 0 + db = {} + CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| + if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] + begin + c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles + rescue + c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters + end + db[c] ||= {} + db[c][:id] ||= row[2] + if row[33].match(/Positiv/i) + db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100 + elsif row[33].match(/Negativ/i) + db[c][:value] ||= "non-mutagenic" + end + end + i += 1 + end + File.open(File.join(parts,"efsa.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + db.each do |s,v| + f.puts [v[:id],s,v[:value]].join "," + end + end + + # merge datasets + hansen = Dataset.from_csv_file File.join(parts,"hansen.csv") + efsa = Dataset.from_csv_file File.join(parts,"efsa.csv") + kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf") + datasets = [hansen,efsa,kazius] + map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true + dataset.merged_features.first.name = "Mutagenicity" + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv} + meta = { + :species => "Salmonella typhimurium", + :endpoint => "Mutagenicity", + :source => [kazius_url,hansen_url,efsa_url].join(", "), + :qmrf => { "group": "QMRF 4.10. 
Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, + } + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} + + # cleanup + datasets << dataset + datasets.each{|d| d.delete } + File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv") + end + + # Download Blood Brain Barrier Penetration dataset into the data folder + def self.blood_brain_barrier + url = "http://cheminformatics.org/datasets/li/bbp2.smi" + name = "Blood_Brain_Barrier_Penetration-Human" + $logger.debug name + map = {"n" => "non-penetrating", "p" => "penetrating"} + table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t" + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts "ID,SMILES,#{name}" + table.each do |row| + f.puts [row[1],row[0],map[row[3]]].join(",") + end + end + meta = { + :species => "Human", + :endpoint => "Blood Brain Barrier Penetration", + :source => url, + :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"}, + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download the combined LOAEL dataset from Helma et al 2018 into the data folder + def self.loael + # TODO: fix url?? + url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv" + name = "Lowest_observed_adverse_effect_level-Rats" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + CSV.parse(RestClientWrapper.get(url).to_s) do |row| + f.puts [row[0],row[1]].join "," + end + end + meta = { + :species => "Rat", + :endpoint => "Lowest observed adverse effect level", + :source => url, + :unit => "mmol/kg_bw/day", + :qmrf => { + "name": "QMRF 4.14. Repeated dose toxicity", + "group": "QMRF 4.Human Health Effects" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the public folder + # The original file requires an email request, this is a temporary workaround + def self.daphnia + #url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv" + src = File.join(DATA,"parts","toxicity_data.xlsx") + name = "Acute_toxicity-Daphnia_magna" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + i = 0 + CSV.parse(`xlsx2csv #{src}`) do |row| + i == 0 ? v = "-log[LC50_mmol/L]" : v = -Math.log10(10**-row[3].to_f*1000) + f.puts [row[0],row[1],v].join(",") + i += 1 + end + end + meta = { "species": "Daphnia magna", + "endpoint": "Acute toxicity", + "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/", + "unit": "mmol/L", + "qmrf": { + "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)", + "name": "EC C. 2. Daphnia sp Acute Immobilisation Test" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download all public lazar datasets into the data folder + def self.public_data + + # Classification + [ + { + :aid => 1205, + :species => "Rodents", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1208, + :species => "Rat", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. 
Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1199, + :species => "Mouse", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + } + ].each do |assay| + Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogenic", inactive: "non-carcinogenic", qmrf: assay[:qmrf] + end + Download.mutagenicity + Download.blood_brain_barrier + + # Regression + [ + { + :aid => 1195, + :species => "Human", + :endpoint => "Maximum Recommended Daily Dose", + :qmrf => { + "group": "QMRF 4.14. Repeated dose toxicity", + "name": "OECD 452 Chronic Toxicity Studies" + }, + },{ + :aid => 1208, + :species => "Rat (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1199, + :species => "Mouse (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1188, + :species => "Fathead minnow", + :endpoint => "Acute toxicity", + :qmrf => { + "group": "QMRF 3.3. Acute toxicity to fish (lethality)", + "name": "EC C. 1. Acute Toxicity for Fish" + } + } + ].each do |assay| + Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf] + end + + Download.loael + Download.daphnia + +=begin + # 1204 estrogen receptor + # 1259408, # GENE-TOX + # 1159563 HepG2 cytotoxicity assay + # 588209 hepatotoxicity + # 1259333 cytotoxicity + # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity + # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells + # 116724 Acute toxicity determined after intravenal administration in mice + # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days +=end + end + + end +end diff --git a/lib/enm-import.rb b/lib/enm-import.rb new file mode 100644 index 0000000..cf1a26f --- /dev/null +++ b/lib/enm-import.rb @@ -0,0 +1,125 @@ +module OpenTox + + # Import data from external databases + module Import + + class Enanomapper + include OpenTox + + # Import from eNanoMapper + def self.import + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + datasets = {} + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"] + bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip) + $logger.debug bundle["title"].strip + nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"] + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json})) + source = data["dataEntry"][0]["compound"]["URI"] + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << 
data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) + end + compound.source = source + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + #np["bundles"].keys.each do |bundle_uri| + #nanoparticle.dataset_ids << datasets[bundle_uri].id + #end + + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } + + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end + end + end + nanoparticle.save + print "#{n}, " + end + puts + end + datasets.each { |u,d| d.save } + end + +=begin + def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] + bundles.each do |bundle| + uri = bundle["URI"] + study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) + study["@graph"].each do |i| + puts i.to_yaml if i.keys.include? 
"sio:has-value" + end + end + datasets.collect{|d| d.id} + end +=end + + end + + end + +end diff --git a/lib/error.rb b/lib/error.rb deleted file mode 100644 index 39b3c76..0000000 --- a/lib/error.rb +++ /dev/null @@ -1,66 +0,0 @@ -module OpenToxError - attr_accessor :http_code, :message, :cause - def initialize message=nil - message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes - super message - @http_code ||= 500 - @message = message.to_s - @cause = cut_backtrace(caller) - $logger.error("\n"+JSON.pretty_generate({ - :http_code => @http_code, - :message => @message, - :cause => @cause - })) - end - - def cut_backtrace(trace) - if trace.is_a?(Array) - cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)} - cut_index ||= trace.size - cut_index -= 1 - cut_index = trace.size-1 if cut_index < 0 - trace[0..cut_index] - else - trace - end - end - -end - -class RuntimeError - include OpenToxError -end - -# clutters log file with library errors -#class NoMethodError - #include OpenToxError -#end - -module OpenTox - - class Error < RuntimeError - include OpenToxError - - def initialize(code, message=nil) - @http_code = code - super message - end - end - - # OpenTox errors - RestClientWrapper.known_errors.each do |error| - # create error classes - c = Class.new Error do - define_method :initialize do |message=nil| - super error[:code], message - end - end - OpenTox.const_set error[:class],c - - # define global methods for raising errors, eg. bad_request_error - Object.send(:define_method, error[:method]) do |message| - raise c.new(message) - end - end - -end diff --git a/lib/feature.rb b/lib/feature.rb index f811aef..296a174 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -1,37 +1,112 @@ module OpenTox - # Basic feature class - class Feature - field :measured, type: Boolean - field :calculated, type: Boolean - field :category, type: String - field :unit, type: String - field :conditions, type: Hash + # Original ID (e.g. from CSV input) + class OriginalId < Feature + field :dataset_id, type: BSON::ObjectId + end - # Is it a nominal feature - # @return [TrueClass,FalseClass] - def nominal? - self.class == NominalFeature - end + # Original SMILES (e.g. from CSV input) + class OriginalSmiles < Feature + field :dataset_id, type: BSON::ObjectId + end - # Is it a numeric feature - # @return [TrueClass,FalseClass] - def numeric? 
- self.class == NumericFeature + # Warnings + class Warnings < Feature + field :dataset_id, type: BSON::ObjectId + end + + # Confidence + class Confidence < Feature + field :dataset_id, type: BSON::ObjectId + def name + "Confidence" end end - # Feature for categorical variables + # Categorical variables class NominalFeature < Feature field :accept_values, type: Array end - # Feature for quantitative variables + # Quantitative variables class NumericFeature < Feature + field :unit, type: String + end + + # Nominal biological activity + class NominalBioActivity < NominalFeature + end + + # Numeric biological activity + class NumericBioActivity < NumericFeature + end + + # Merged nominal biological activity + class MergedNominalBioActivity < NominalBioActivity + field :original_feature_ids, type: Array + field :transformations, type: Array + end + + # Merged numeric biological activity + class MergedNumericBioActivity < NumericBioActivity + field :original_feature_ids, type: Array + end + + # Transformed nominal biological activity + class TransformedNominalBioActivity < NominalFeature + field :original_feature_id, type: BSON::ObjectId + field :transformation, type: Hash + end + + # Transformed numeric biological activity + class TransformedNumericBioActivity < NumericFeature + field :original_feature_id, type: BSON::ObjectId + field :transformation, type: String + end + + # Nominal lazar prediction + class NominalLazarPrediction < NominalFeature + field :model_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + def name + "Prediction: #{self[:name]}" + end + end + + class LazarPredictionProbability < NominalLazarPrediction + def name + "Probability: #{self[:name]}" + end + end + + # Numeric lazar prediction + class NumericLazarPrediction < NumericFeature + field :model_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + def name + "Prediction: #{self[:name]}" + end + end + + class LazarPredictionInterval < NumericLazarPrediction + def name + "#{self[:name].capitalize} prediction interval" + end + end + + class NominalSubstanceProperty < NominalFeature + end + + class NumericSubstanceProperty < NumericFeature + end + + class NanoParticleProperty < NumericSubstanceProperty + field :category, type: String + field :conditions, type: Hash end # Feature for SMARTS fragments - class Smarts < NominalFeature + class Smarts < Feature field :smarts, type: String index "smarts" => 1 # Create feature from SMARTS string diff --git a/lib/import.rb b/lib/import.rb index 0857717..cdf96e3 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -1,125 +1,21 @@ module OpenTox - # Import data from external databases - module Import - - class Enanomapper - include OpenTox - - # Import from eNanoMapper - def self.import - # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) - datasets = {} - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"] - bundles.each do |bundle| - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip) - $logger.debug bundle["title"].strip - nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"] - nanoparticles.each_with_index do |np,n| - core_id = nil - coating_ids = [] - np["composition"].each do |c| - uri = c["component"]["compound"]["URI"] - data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, 
{accept: :json})) - source = data["dataEntry"][0]["compound"]["URI"] - smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] - names = [] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] - if smiles - compound = Compound.find_or_create_by(:smiles => smiles) - compound.name = names.first - compound.names = names.compact - else - compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) - end - compound.source = source - compound.save - if c["relation"] == "HAS_CORE" - core_id = compound.id.to_s - elsif c["relation"] == "HAS_COATING" - coating_ids << compound.id.to_s - end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core_id => core_id, - :coating_ids => coating_ids - ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end - - studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] - studies.each do |study| - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - study["effects"].each do |effect| - - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } - - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] - end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset - end - end - end - nanoparticle.save - print "#{n}, " - end - puts + class Import + + # Import datasets from the data folder, create and validate models + # @return [Array<OpenTox::Model::Validation>] Validated models + def self.public_data + models = [] + Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f| + $logger.debug f + m = Model::Validation.from_csv_file f + $logger.debug "#{f} ID: #{m.id.to_s}" + m.crossvalidations.each do |cv| + $logger.debug cv.statistics end - datasets.each { |u,d| d.save } + models << m end - -=begin - def self.import_ld # 
defunct, AMBIT JSON_LD does not have substance entries - #get list of bundle URIs - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - datasets = [] - bundles.each do |bundle| - uri = bundle["URI"] - study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) - study["@graph"].each do |i| - puts i.to_yaml if i.keys.include? "sio:has-value" - end - end - datasets.collect{|d| d.id} - end -=end - + models end - end - end diff --git a/lib/lazar.rb b/lib/lazar.rb index 32f0317..e77de9d 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment +# CH: this interferes with /etc/hosts on my machine # search for a central mongo database in use # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment -CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp +# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp Mongoid.load_configuration({ :clients => { :default => { :database => ENV["LAZAR_ENV"], - :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + :hosts => ["localhost:27017"] } } }) Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs # Logger setup @@ -68,13 +71,15 @@ suppressPackageStartupMessages({ }) " +PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" +CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" + # OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", "rest-client-wrapper.rb", - "error.rb", "opentox.rb", "feature.rb", "physchem.rb", @@ -94,6 +99,6 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "train-test-validation.rb", "leave-one-out-validation.rb", "crossvalidation.rb", - #"experiment.rb", + "download.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index c33c92b..7d73b89 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -9,25 +9,18 @@ module OpenTox # @param [OpenTox::Model::Lazar] # @return [OpenTox::Validation::LeaveOneOut] def self.create model - bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." 
if model.algorithms[:feature_selection] + raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" t = Time.now - model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut loo = klass.new :model_id => model.id predictions = model.predict model.training_dataset.substances predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value] predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - loo.nr_instances = predictions.size - loo.nr_unpredicted = nr_unpredicted loo.predictions = predictions loo.statistics $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" @@ -40,25 +33,27 @@ module OpenTox class ClassificationLeaveOneOut < LeaveOneOut include ClassificationStatistics field :accept_values, type: Array - field :confusion_matrix, type: Array, default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId + field :confusion_matrix, type: Hash + field :weighted_confusion_matrix, type: Hash + field :accuracy, type: Hash + field :weighted_accuracy, type: Hash + field :true_rate, type: Hash + field :predictivity, type: Hash + field :nr_predictions, type: Hash + field :probability_plot_id, type: BSON::ObjectId end # Leave one out validation for regression models class RegressionLeaveOneOut < LeaveOneOut include RegressionStatistics - field :rmse, type: Float, default: 0 - field :mae, type: Float, default: 0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId + field :rmse, type: Hash + field :mae, type: Hash + field :r_squared, type: Hash + field :within_prediction_interval, type: Hash + field :out_of_prediction_interval, type: Hash + field :nr_predictions, type: Hash field :warnings, type: Array + field :correlation_plot_id, type: BSON::ObjectId end end diff --git a/lib/model.rb b/lib/model.rb index dce53a9..d7b2df6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,20 +32,20 @@ module OpenTox # @param [OpenTox::Feature, nil] prediction_feature # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature # @param [Hash, nil] algorithms - # Default algorithms will be used, if no algorithms parameter is provided. 
The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. + # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} - bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset - prediction_feature = training_dataset.features.first unless prediction_feature - # TODO: prediction_feature without training_dataset: use all available data + raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset + prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # guess model type - prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new + prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{prediction_feature.name} (#{training_dataset.name})" + model.name = training_dataset.name + # git or gem versioning dir = File.dirname(__FILE__) path = File.expand_path("../", File.expand_path(dir)) @@ -62,7 +62,7 @@ module OpenTox # set defaults# substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq - bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 if substance_classes.first == "OpenTox::Compound" @@ -80,7 +80,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1, + :min => [0.5,0.2], } elsif model.class == LazarRegression model.algorithms[:prediction] = { @@ -88,7 +88,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5, + :min => [0.5,0.2], } end @@ -100,7 +100,7 @@ module OpenTox }, :similarity => { :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5, + :min => [0.5,0.2], }, :prediction => { :method => "Algorithm::Caret.rf", @@ -110,7 +110,7 @@ module OpenTox }, } else - bad_request_error "Cannot create models for #{substance_classes.first}." + raise ArgumentError, "Cannot create models for #{substance_classes.first}." end # overwrite defaults with explicit parameters @@ -175,7 +175,7 @@ module OpenTox model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? 
p[i].median : nil}} else - bad_request_error "Descriptor method '#{descriptor_method}' not implemented." + raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -197,7 +197,7 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min] + def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -224,11 +224,11 @@ module OpenTox end end else - bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." + raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - prediction = {:warnings => [], :measurements => []} - prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] + prediction ||= {:warnings => [], :measurements => []} + prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first neighbor_ids = [] neighbor_similarities = [] neighbor_dependent_variables = [] @@ -238,7 +238,7 @@ module OpenTox substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] + prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? @@ -264,25 +264,37 @@ module OpenTox if neighbor_similarities.empty? prediction[:value] = nil - prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset." + prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset." + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})." + return prediction + end elsif neighbor_similarities.size == 1 prediction[:value] = nil - prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set." - prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] + prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." + prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}] + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." + return prediction + end else query_descriptors.collect!{|d| d ? 
1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} - #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." - #end end - if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 - prediction - else # try again with a lower threshold - predict_substance substance, 0.2 + if threshold == algorithms[:similarity][:min].first + if prediction[:warnings].empty? + prediction[:confidence] = "Similar to bioassay results" + return prediction + else # try again with a lower threshold + prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." + predict_substance substance, algorithms[:similarity][:min].last, prediction + end + elsif threshold < algorithms[:similarity][:min].first + prediction[:confidence] = "Lower than bioassay results" + return prediction end end @@ -302,13 +314,18 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." + raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c + if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] + prediction_feature.accept_values.each do |v| + predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) + end + end predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end @@ -320,17 +337,35 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - # prepare prediction dataset - measurement_feature = Feature.find prediction_feature_id - - prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.create( - :name => "Lazar prediction for #{prediction_feature.name}", - :creator => __FILE__, - :prediction_feature_id => prediction_feature.id, - :predictions => predictions - ) - return prediction_dataset + d = object.copy + #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) + confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id) + if prediction_feature.is_a? NominalBioActivity + f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features = {} + prediction_feature.accept_values.each do |v| + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + elsif prediction_feature.is_a? 
NumericBioActivity + f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) + prediction_interval = [] + ["lower","upper"].each do |v| + prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + end + + # add predictions to dataset + predictions.each do |substance_id,p| + substance_id = BSON::ObjectId.from_string(substance_id) + d.add substance_id,confidence_feature,p[:confidence] + unless p[:value].nil? + d.add substance_id,f,p[:value] + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] + p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] + end + end + d.save + return d end end @@ -402,6 +437,7 @@ module OpenTox field :species, type: String field :source, type: String field :unit, type: String + field :warnings, type: Array field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId @@ -461,11 +497,11 @@ module OpenTox end # Create and validate a lazar model from a csv file with training data and a json file with metadata - # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. - # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations + # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar. + # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") - bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file + raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? 
metadata_file model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset @@ -477,6 +513,7 @@ module OpenTox # Create and validate a nano-lazar model, import data from eNanoMapper if necessary # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf + # *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track this changes permanently!* # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms @@ -488,7 +525,7 @@ module OpenTox unless training_dataset # try to import Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first diff --git a/lib/opentox.rb b/lib/opentox.rb index 5c300cf..fb2a579 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -11,13 +11,6 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String - field :source, type: String - field :warnings, type: Array, default: [] - - def warn warning - $logger.warn warning - warnings << warning - end end OpenTox.const_set klass,c end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 0dd1c8a..d482902 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -84,7 +84,7 @@ class String def to_boolean return true if self == true || self =~ (/(true|t|yes|y|1)$/i) return false if self == false || self.nil? 
|| self =~ (/(false|f|no|n|0)$/i) - bad_request_error "invalid value for Boolean: \"#{self}\"" + raise ArgumentError, "invalid value for Boolean: \"#{self}\"" end end diff --git a/lib/physchem.rb b/lib/physchem.rb index 07df867..2af043b 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -1,7 +1,7 @@ module OpenTox # Feature for physico-chemical descriptors - class PhysChem < NumericFeature + class PhysChem < NumericSubstanceProperty field :library, type: String field :descriptor, type: String @@ -45,7 +45,7 @@ module OpenTox def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description) end end @@ -59,11 +59,11 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description) end end udesc diff --git a/lib/regression.rb b/lib/regression.rb index 25c0732..fd2855f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -17,7 +17,7 @@ module OpenTox sim_sum += weights[i] end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum - {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]} + {:value => prediction} end end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index c9fd40f..db23e66 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -28,14 +28,14 @@ module OpenTox uri = Addressable::URI.encode(uri) # check input - bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) + raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid - bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri + raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? $aa || URI(uri).host == URI($aa[:uri]).host else - bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] + raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] end end @@ -56,6 +56,7 @@ module OpenTox @response = @request.execute do |response, request, result| if [301, 302, 307].include? 
response.code and request.method == :get response.follow_redirection(request, result) +=begin elsif response.code >= 400 and !URI.task?(uri) error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first begin # errors are returned as error reports in json, try to parse @@ -68,6 +69,7 @@ module OpenTox cause = nil end Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method +=end else response end @@ -75,6 +77,7 @@ module OpenTox end end +=begin #@return [Array] of hashes with error code, method and class def self.known_errors errors = [] @@ -88,6 +91,7 @@ module OpenTox end errors end +=end end end diff --git a/lib/substance.rb b/lib/substance.rb index ef49659..5c486d8 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -3,7 +3,6 @@ module OpenTox # Base class for substances (e.g. compunds, nanoparticles) class Substance field :properties, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] end end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 9a5532d..d034cd1 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -18,22 +18,15 @@ module OpenTox validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms validation_model.save predictions = validation_model.predict test_set.substances - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - # hack to avoid mongos file size limit error on large datasets - #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i) + # remove neighbors to avoid mongos file size limit error on large datasets + predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i) validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.substances.size, - :nr_unpredicted => nr_unpredicted, :predictions => predictions ) validation.save diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 69e7992..5fd9985 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -7,62 +7,66 @@ module OpenTox # @return [Hash] def statistics self.accept_values = model.prediction_feature.accept_values - self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - nr_instances = 0 + self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}} + self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? + # TODO: use measured majority class or all measurements?? 
if pred[:measurements].uniq.size == 1 and pred[:probabilities] m = pred[:measurements].first if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][i] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][i] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][i] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][(i+1)%2] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end end end end - self.true_rate = {} - self.predictivity = {} + + self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}} + self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}} accept_values.each_with_index do |v,i| - self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + [:all,:confidence_high,:confidence_low].each do |a| + self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f + self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f end end - self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + self.accuracy = {} + [:all,:confidence_high,:confidence_low].each do |a| + self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f + end $logger.debug "Accuracy #{accuracy}" + $logger.debug "Nr Predictions #{nr_predictions}" save { :accept_values => accept_values, :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, :true_rate => self.true_rate, :predictivity => self.predictivity, + :nr_predictions => nr_predictions, } end @@ -97,7 +101,7 @@ module OpenTox R.assign "probability", probabilities R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => 
"#{self.id.to_s}_probability_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) #end @@ -108,29 +112,27 @@ module OpenTox # Statistical evaluation of regression validations module RegressionStatistics + attr_accessor :x, :y + # Get statistics # @return [Hash] def statistics self.warnings = [] - self.rmse = 0 - self.mae = 0 - self.within_prediction_interval = 0 - self.out_of_prediction_interval = 0 - x = [] - y = [] + self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + @x = {:all => [],:confidence_high => [],:confidence_low => []} + @y = {:all => [],:confidence_high => [],:confidence_low => []} + self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - if pred[:value] and pred[:measurements] - x << pred[:measurements].median - y << pred[:value] - error = pred[:value]-pred[:measurements].median - self.rmse += error**2 - self.mae += error.abs - if pred[:prediction_interval] - if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] - self.within_prediction_interval += 1 - else - self.out_of_prediction_interval += 1 - end + !if pred[:value] and pred[:measurements] and !pred[:measurements].empty? + insert_prediction pred, :all + if pred[:confidence].match(/Similar/i) + insert_prediction pred, :confidence_high + elsif pred[:confidence].match(/Low/i) + insert_prediction pred, :confidence_low end else trd_id = model.training_dataset_id @@ -139,39 +141,49 @@ module OpenTox $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}." 
end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size - self.rmse = Math.sqrt(self.rmse/predictions.size) + [:all,:confidence_high,:confidence_low].each do |a| + if @x[a].size > 2 + R.assign "measurement", @x[a] + R.assign "prediction", @y[a] + R.eval "r <- cor(measurement,prediction,use='pairwise')" + self.r_squared[a] = R.eval("r").to_ruby**2 + else + self.r_squared[a] = 0 + end + if self.nr_predictions[a] > 0 + self.mae[a] = self.mae[a]/self.nr_predictions[a] + self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a]) + else + self.mae[a] = nil + self.rmse[a] = nil + end + end $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" - $logger.debug "#{warnings}" + $logger.debug "Nr predictions #{nr_predictions}" + $logger.debug "#{within_prediction_interval} measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, - :within_prediction_interval => within_prediction_interval, + :within_prediction_interval => self.within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, + :nr_predictions => nr_predictions, } end - # Get percentage of measurements within the prediction interval - # @return [Float] - def percent_within_prediction_interval - 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) - end - # Plot predicted vs measured values # @param [String,nil] format # @return [Blob] def correlation_plot format: "png" - unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + #unless correlation_plot_id + #tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_correlation.#{format}") x = [] y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) @@ -187,7 +199,7 @@ module OpenTox title = "log2(Net cell association [mL/ug(Mg)])" else title = feature.name - title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank? + title += "-log10(#{feature.unit})" if feature.unit and !feature.unit.blank? 
end R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" @@ -195,7 +207,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end @@ -215,6 +227,24 @@ module OpenTox end worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h end + + private + + def insert_prediction prediction, type + self.nr_predictions[type] +=1 + @x[type] << prediction[:measurements].median + @y[type] << prediction[:value] + error = prediction[:value]-prediction[:measurements].median + self.rmse[type] += error**2 + self.mae[type] += error.abs + if prediction[:prediction_interval] + if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1] + self.within_prediction_interval[type] += 1 + else + self.out_of_prediction_interval[type] += 1 + end + end + end end end end diff --git a/lib/validation.rb b/lib/validation.rb index c9954b6..9402361 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -10,8 +10,6 @@ module OpenTox store_in collection: "validations" field :name, type: String field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer, default: 0 - field :nr_unpredicted, type: Integer, default: 0 field :predictions, type: Hash, default: {} field :finished_at, type: Time |