From d5bf97c2cb999539c56bf59aa1d7d3286745be84 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 23 Sep 2015 14:51:41 +0200
Subject: validations fixed (all models were executed with default parameters)

---
 lib/compound.rb           |  41 +++++++++++-
 lib/crossvalidation.rb    |   7 +-
 lib/dataset.rb            |  40 ++++++++++++
 lib/descriptor.rb         |   3 +-
 lib/experiment.rb         | 162 +++++++++++++++++++++-------------------------
 lib/model.rb              |  88 ++++++++++++++++---------
 lib/unique_descriptors.rb |   4 +-
 lib/validation.rb         |  12 +++-
 8 files changed, 227 insertions(+), 130 deletions(-)

(limited to 'lib')

diff --git a/lib/compound.rb b/lib/compound.rb
index 7abd913..d3df125 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -44,6 +44,21 @@ module OpenTox
       compound.save
       compound
     end
+    
+    #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
+    def mpd 
+      smarts = obconversion(smiles,"smi","mpd").strip.split("\t")
+      smarts.shift # remove Title
+      smarts
+
+    end
+
+    #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
+    def mna level=2
+      smarts = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
+      smarts.shift # remove Title
+      smarts
+    end
 
     def openbabel_fingerprint type="FP2"
       unless self.send(type.downcase.to_sym) # stored fingerprint
@@ -72,7 +87,7 @@ module OpenTox
           end
           start += bitsperint
         end
-        update type.downcase.to_sym, bits_set
+        update_attribute type.downcase.to_sym, bits_set
       end
       self.send(type.downcase.to_sym) 
     end
@@ -242,6 +257,28 @@ module OpenTox
       neighbors
     end
 
+    def physchem_neighbors params
+      feature_dataset = Dataset.find params[:feature_dataset_id]
+      query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
+      neighbors = []
+      feature_dataset.data_entries.each_with_index do |fingerprint, i|
+        # TODO implement pearson and cosine similarity separatly
+        R.assign "x", query_fingerprint
+        R.assign "y", fingerprint
+        # pearson r
+        #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
+        #p "pearson"
+        #p sim
+        #p "cosine"
+        sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
+        #p sim
+        if sim >= params[:min_sim]
+          neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+        end
+      end
+      neighbors
+    end
+
     def neighbors threshold=0.7
       # TODO restrict to dataset
       # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
@@ -308,7 +345,7 @@ print sdf
     end
 
     def obconversion(identifier,input_format,output_format,option=nil)
-      self.class.obconversion(identifier,input_format,output_format,option=nil)
+      self.class.obconversion(identifier,input_format,output_format,option)
     end
   end
 end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 337b434..4c80344 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -33,15 +33,12 @@ module OpenTox
       nr_instances = 0
       nr_unpredicted = 0
       predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
       training_dataset = Dataset.find model.training_dataset_id
       training_dataset.folds(n).each_with_index do |fold,fold_nr|
         fork do # parallel execution of validations
           $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
           t = Time.now
-          #p validation_class#.create(model, fold[0], fold[1],cv)
-          validation = validation_class.create(model, fold[0], fold[1],cv)
-          #p validation
+          validation = Validation.create(model, fold[0], fold[1],cv)
           $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}:  #{Time.now-t} seconds"
         end
       end
@@ -170,7 +167,7 @@ module OpenTox
       y = predictions.collect{|p| p[2]}
       R.assign "measurement", x
       R.assign "prediction", y
-      R.eval "r <- cor(-log(measurement),-log(prediction))"
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
       r = R.eval("r").to_ruby
 
       mae = mae/predictions.size
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 946fd90..7c8ab44 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -46,6 +46,12 @@ module OpenTox
         else
           @data_entries = Marshal.load(data_entry_file.data)
           bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
+          unless @data_entries.first.size == feature_ids.size
+            # TODO: fix (unknown) source of empty data_entries
+            sleep 1
+            data_entry_file = $gridfs.find_one(_id: data_entries_id)
+            @data_entries = Marshal.load(data_entry_file.data)
+          end
           bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
           # TODO: data_entries can be empty, poorly reproducible, mongo problem?
           bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
@@ -281,6 +287,29 @@ module OpenTox
         end
       end
     end
+
+    def scale
+      scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
+      centers = []
+      scales = []
+      feature_ids.each_with_index do |feature_id,col| 
+        R.assign "x", data_entries.collect{|de| de[col]}
+        R.eval "scaled = scale(x,center=T,scale=T)"
+        centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
+        scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
+        R.eval("scaled").to_ruby.each_with_index do |value,row|
+          scaled_data_entries[row][col] = value
+        end
+      end
+      scaled_dataset = ScaledDataset.new(attributes)
+      scaled_dataset["_id"] = BSON::ObjectId.new
+      scaled_dataset["_type"] = "OpenTox::ScaledDataset"
+      scaled_dataset.centers = centers
+      scaled_dataset.scales = scales
+      scaled_dataset.data_entries = scaled_data_entries
+      scaled_dataset.save_all
+      scaled_dataset
+    end
   end
 
   # Dataset for lazar predictions
@@ -297,6 +326,17 @@ module OpenTox
   # Dataset for descriptors (physchem)
   class DescriptorDataset < Dataset
     field :feature_calculation_algorithm, type: String
+
+  end
+
+  class ScaledDataset < DescriptorDataset
+
+    field :centers, type: Array, default: []
+    field :scales, type: Array, default: []
+
+    def original_value value, i
+      value * scales[i] + centers[i]
+    end
   end
 
   # Dataset for fminer descriptors
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 5ae0ef2..9733bde 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -16,7 +16,7 @@ module OpenTox
       LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
       JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
 
-      obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
+      obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
       OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
         name,description = d.split(/\s+/,2)
         ["Openbabel."+name,description] unless obexclude.include? name
@@ -107,6 +107,7 @@ module OpenTox
           des[lib] << descriptor
         end
         des.each do |lib,descriptors|
+          p lib, descriptors
           send(lib, descriptors)
         end
         serialize
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 0a76c53..616a273 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -4,105 +4,93 @@ module OpenTox
     field :dataset_ids, type: Array
     field :model_settings, type: Array, default: []
     field :results, type: Hash, default: {}
-  end
 
-  def run 
-    dataset_ids.each do |dataset_id|
-      dataset = Dataset.find(dataset_id)
-      results[dataset_id.to_s] = []
-      model_settings.each do |setting|
-        model = Object.const_get(setting[:algorithm]).create dataset
-        model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
-        model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
-        model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
-        model.save
-        repeated_crossvalidation = RepeatedCrossValidation.create model
-        results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
+    def run 
+      dataset_ids.each do |dataset_id|
+        dataset = Dataset.find(dataset_id)
+        results[dataset_id.to_s] = []
+        model_settings.each do |setting|
+          model_algorithm = setting.delete :model_algorithm
+          model = Object.const_get(model_algorithm).create dataset, setting
+          #model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
+          #model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
+          #model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
+          p model
+          model.save
+          repeated_crossvalidation = RepeatedCrossValidation.create model
+          results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
+        end
       end
+      save
     end
-    save
-  end
-
-  def self.create params
-    experiment = self.new
-    $logge.debug "Experiment started ..."
-    #experiment.run params
-    experiment
-  end
 
-  def report
-    # TODO significances
-    # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
-    report = {}
-    report[:name] = name
-    report[:experiment_id] = self.id.to_s
-    report[:results] = {}
-    parameters = []
-    dataset_ids.each do |dataset_id|
-      dataset_name = Dataset.find(dataset_id).name
-      report[:results][dataset_name] = {}
-      report[:results][dataset_name][:anova] = {}
-      report[:results][dataset_name][:data] = []
-      results[dataset_id.to_s].each do |result|
-        model = Model::Lazar.find(result[:model_id])
-        repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
-        crossvalidations = repeated_cv.crossvalidations
-        if crossvalidations.first.is_a? ClassificationCrossValidation
-          parameters = [:accuracy,:true_rate,:predictivity]
-        elsif crossvalidations.first.is_a? RegressionCrossValidation
-          parameters = [:rmse,:mae,:r_squared]
-        end
-        summary = {}
-        [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
-          summary[key] = model[key]
-        end
-        summary[:nr_instances] = crossvalidations.first.nr_instances
-        summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
-        summary[:time] = crossvalidations.collect{|cv| cv.time}
-        parameters.each do |param|
-          summary[param] = crossvalidations.collect{|cv| cv.send(param)}
+    def report
+      # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
+      report = {}
+      report[:name] = name
+      report[:experiment_id] = self.id.to_s
+      report[:results] = {}
+      parameters = []
+      dataset_ids.each do |dataset_id|
+        dataset_name = Dataset.find(dataset_id).name
+        report[:results][dataset_name] = {}
+        report[:results][dataset_name][:anova] = {}
+        report[:results][dataset_name][:data] = []
+        results[dataset_id.to_s].each do |result|
+          model = Model::Lazar.find(result[:model_id])
+          repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+          crossvalidations = repeated_cv.crossvalidations
+          if crossvalidations.first.is_a? ClassificationCrossValidation
+            parameters = [:accuracy,:true_rate,:predictivity]
+          elsif crossvalidations.first.is_a? RegressionCrossValidation
+            parameters = [:rmse,:mae,:r_squared]
+          end
+          summary = {}
+          [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+            summary[key] = model[key]
+          end
+          summary[:nr_instances] = crossvalidations.first.nr_instances
+          summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+          summary[:time] = crossvalidations.collect{|cv| cv.time}
+          parameters.each do |param|
+            summary[param] = crossvalidations.collect{|cv| cv.send(param)}
+          end
+          report[:results][dataset_name][:data] << summary
         end
-        report[:results][dataset_name][:data] << summary
       end
-    end
-    report[:results].each do |dataset,results|
-      ([:time,:nr_unpredicted]+parameters).each do |param|
-        experiments = []
-        outcome = []
-        results[:data].each_with_index do |result,i|
-          result[param].each do |p|
-            experiments << i
-            outcome << p
+      report[:results].each do |dataset,results|
+        ([:time,:nr_unpredicted]+parameters).each do |param|
+          experiments = []
+          outcome = []
+          results[:data].each_with_index do |result,i|
+            result[param].each do |p|
+              experiments << i
+              p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
+              outcome << p
+            end
           end
-        end
-        R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
-        R.eval "experiment_nr = factor(experiment_nr)"
-        R.assign "outcome",outcome
-        R.eval "data = data.frame(experiment_nr,outcome)"
-        # one-way ANOVA
-        R.eval "fit = aov(outcome ~ experiment_nr, data=data)"
-        # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
-        p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
-        # aequivalent
-        # sum = R.eval("summary(fit)")
-        #p_value = sum.to_ruby.first.last.first
+          R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
+          R.eval "experiment_nr = factor(experiment_nr)"
+          R.assign "outcome", outcome
+          R.eval "data = data.frame(experiment_nr,outcome)"
+          # one-way ANOVA
+          R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
+          # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
+          p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
+          # aequivalent
+          # sum = R.eval("summary(fit)")
+          #p_value = sum.to_ruby.first.last.first
+          report[:results][dataset][:anova][param] = p_value
 =begin
-        if p_value < 0.01
-          p_value = "#{p_value} ***"
-        elsif p_value < 0.05
-          p_value = "#{p_value} **"
-        elsif p_value < 0.1
-          p_value = "#{p_value} *"
-        end
 =end
-        report[:results][dataset][:anova][param] = p_value
+        end
       end
+      report
     end
-    report
-  end
 
-  def summary
-    report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
+    def summary
+      report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
+    end
   end
 
 end
diff --git a/lib/model.rb b/lib/model.rb
index 9892f64..817a61e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -26,25 +26,26 @@ module OpenTox
 
       # algorithms
       field :neighbor_algorithm, type: String
-      field :neighbor_algorithm_parameters, type: Hash
+      field :neighbor_algorithm_parameters, type: Hash, default: {}
 
       # Create a lazar model from a training_dataset and a feature_dataset
       # @param [OpenTox::Dataset] training_dataset
       # @return [OpenTox::Model::Lazar] Regression or classification model
-      def self.create training_dataset
+      def initialize training_dataset, params={}
 
+        super params
         bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
 
         # TODO document convention
         prediction_feature = training_dataset.features.first
-        prediction_feature.nominal ?  lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
-        lazar.training_dataset_id = training_dataset.id
-        lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
-        lazar.prediction_feature_id = prediction_feature.id
-        lazar.name = "#{training_dataset.name} #{prediction_feature.name}" 
-
-        lazar.save
-        lazar
+        # set defaults for empty parameters
+        self.prediction_feature_id ||= prediction_feature.id
+        self.training_dataset_id ||= training_dataset.id
+        self.name ||= "#{training_dataset.name} #{prediction_feature.name}" 
+        self.neighbor_algorithm_parameters ||= {}
+        self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+        save
+        self
       end
 
       def predict object
@@ -80,6 +81,7 @@ module OpenTox
             next
           end
           neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+
           #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
           # add activities
           # TODO: improve efficiency, takes 3 times longer than previous version
@@ -90,6 +92,17 @@ module OpenTox
           end
           neighbors.compact! # remove neighbors without training activities
           predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size})
+=begin
+# TODO scaled dataset for physchem
+          p neighbor_algorithm_parameters
+          p (neighbor_algorithm_parameters["feature_dataset_id"])
+          d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"])
+          p d
+          p d.class
+          if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset
+            p "SCALED"
+          end
+=end
         end 
 
         # serialize result
@@ -128,15 +141,40 @@ module OpenTox
     end
 
     class LazarClassification < Lazar
-      def initialize
-        super
-        self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
-        self.neighbor_algorithm = "fingerprint_neighbors"
-        self.neighbor_algorithm_parameters = {
+      
+      def self.create training_dataset, params={}
+        model = self.new training_dataset, params
+        model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
+        model.neighbor_algorithm |= "fingerprint_neighbors"
+        model.neighbor_algorithm_parameters ||= {}
+        {
           :type => "FP4",
-          :training_dataset_id => training_dataset_id,
+          :training_dataset_id => training_dataset.id,
           :min_sim => 0.7
-        }
+        }.each do |key,value|
+          model.neighbor_algorithm_parameters[key] ||= value
+        end
+        model.save
+        model
+      end
+    end
+
+    class LazarRegression < Lazar
+
+      def self.create training_dataset, params={}
+        model = self.new training_dataset, params
+        #model.neighbor_algorithm ||= "fingerprint_neighbors"
+        #model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" 
+        #model.neighbor_algorithm_parameters ||= {}
+        #{
+          #:type => "FP4",
+          #:training_dataset_id => training_dataset.id,
+          #:min_sim => 0.7
+        #}.each do |key,value|
+          #model.neighbor_algorithm_parameters[key] ||= value
+        #end
+        model.save
+        model
       end
     end
 
@@ -159,26 +197,12 @@ module OpenTox
       end
     end
 
-    class LazarRegression < Lazar
-
-      def initialize
-        super
-        #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
-        self.neighbor_algorithm = "fingerprint_neighbors"
-        self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" 
-        self.neighbor_algorithm_parameters = {
-          :type => "FP4",
-          :training_dataset_id => self.training_dataset_id,
-          :min_sim => 0.7
-        }
-      end
-    end
-
     class Prediction
       include OpenTox
       include Mongoid::Document
       include Mongoid::Timestamps
 
+      # TODO cv -> repeated cv
       # TODO field Validations
       field :endpoint, type: String
       field :species, type: String
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
index 676f34a..cf9cbf3 100644
--- a/lib/unique_descriptors.rb
+++ b/lib/unique_descriptors.rb
@@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [
   "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
   "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
   "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
-  "Openbabel.L5", #Lipinski Rule of Five
+  #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
   "Openbabel.logP", #octanol/water partition coefficient
   "Openbabel.MP", #Melting point
   "Openbabel.MR", #molar refractivity
@@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [
   "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
   "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
   "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
-  "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
+  #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
   "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
   "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
   "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
diff --git a/lib/validation.rb b/lib/validation.rb
index 63fbd89..9eebef8 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -2,6 +2,7 @@ module OpenTox
 
   class Validation
 
+    field :model_id, type: BSON::ObjectId
     field :prediction_dataset_id, type: BSON::ObjectId
     field :crossvalidation_id, type: BSON::ObjectId
     field :test_dataset_id, type: BSON::ObjectId
@@ -17,9 +18,17 @@ module OpenTox
       Dataset.find test_dataset_id
     end
 
+    def model
+      Model::Lazar.find model_id
+    end
+
     def self.create model, training_set, test_set, crossvalidation=nil
       
-      validation_model = model.class.create training_set#, features
+      atts = model.attributes.dup # do not modify attributes from original model
+      atts["_id"] = BSON::ObjectId.new
+      atts[:training_dataset_id] = training_set.id
+      validation_model = model.class.create training_set, atts
+      validation_model.save
       test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
       prediction_dataset = validation_model.predict test_set_without_activities
       predictions = []
@@ -36,6 +45,7 @@ module OpenTox
         end
       end
       validation = self.new(
+        :model_id => validation_model.id,
         :prediction_dataset_id => prediction_dataset.id,
         :test_dataset_id => test_set.id,
         :nr_instances => test_set.compound_ids.size,
-- 
cgit v1.2.3