From d3a4c309d48b794f2f60f44bb9a3d94f402cc82f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 16 Sep 2015 13:11:45 +0200
Subject: repeated crossvalidations, improved experiment reports

---
 lib/crossvalidation.rb | 57 +++++++++++++++++++++--------------
 lib/dataset.rb         |  1 +
 lib/error.rb           |  2 +-
 lib/experiment.rb      | 81 +++++++++++++++++++++++++-------------------------
 lib/lazar.rb           |  5 ++--
 lib/model.rb           |  3 --
 test/experiment.rb     | 62 +++++++++++++++++++++++++++++---------
 test/validation.rb     | 12 ++++++++
 8 files changed, 141 insertions(+), 82 deletions(-)

diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 90c0d75..f480932 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -22,7 +22,9 @@ module OpenTox
     end
 
     def self.create model, n=10
-      cv = self.new(
+      klass = model.training_dataset.features.first.nominal? ? ClassificationCrossValidation : RegressionCrossValidation # nominal endpoint => classification CV, anything else => regression CV
+      bad_request_error "#{model.training_dataset.features.first} is neither nominal nor numeric." unless klass # NOTE(review): klass is always set by the line above, so this guard cannot fire; numeric-ness is never actually checked
+      cv = klass.new(
         name: model.name,
         model_id: model.id,
         folds: n
@@ -55,6 +57,7 @@ module OpenTox
         nr_unpredicted: nr_unpredicted,
         predictions: predictions
       )
+      cv.statistics
       cv
     end
   end
@@ -70,14 +73,13 @@ module OpenTox
     field :predictivity, type: Hash
     # TODO auc, f-measure (usability??)
 
-    def self.create model, n=10
-      cv = super model, n
+    def statistics
       accept_values = Feature.find(model.prediction_feature_id).accept_values
       confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       true_rate = {}
       predictivity = {}
-      cv.predictions.each do |pred|
+      predictions.each do |pred|
         compound_id,activity,prediction,confidence = pred
         if activity and prediction and confidence.numeric? 
           if prediction == activity
@@ -113,18 +115,16 @@ module OpenTox
           confidence_sum += c
         end
       end
-      cv.update_attributes(
+      update_attributes(
         accept_values: accept_values,
         confusion_matrix: confusion_matrix,
         weighted_confusion_matrix: weighted_confusion_matrix,
-        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f,
+        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
         weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
         true_rate: true_rate,
         predictivity: predictivity,
         finished_at: Time.now
       )
-      cv.save
-      cv
     end
 
     #Average area under roc  0.646
@@ -142,8 +142,7 @@ module OpenTox
     field :correlation_plot_id, type: BSON::ObjectId
     field :confidence_plot_id, type: BSON::ObjectId
 
-    def self.create model, n=10
-      cv = super model, n
+    def statistics
       rmse = 0
       weighted_rmse = 0
       rse = 0
@@ -153,7 +152,7 @@ module OpenTox
       rae = 0
       weighted_rae = 0
       confidence_sum = 0
-      cv.predictions.each do |pred|
+      predictions.each do |pred|
         compound_id,activity,prediction,confidence = pred
         if activity and prediction
           error = Math.log10(prediction)-Math.log10(activity)
@@ -163,24 +162,24 @@ module OpenTox
           weighted_mae += confidence*error.abs
           confidence_sum += confidence
         else
-          cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
         end
       end
-      x = cv.predictions.collect{|p| p[1]}
-      y = cv.predictions.collect{|p| p[2]}
+      x = predictions.collect{|p| p[1]}
+      y = predictions.collect{|p| p[2]}
       R.assign "measurement", x
       R.assign "prediction", y
       R.eval "r <- cor(-log(measurement),-log(prediction))"
       r = R.eval("r").to_ruby
 
-      mae = mae/cv.predictions.size
+      mae = mae/predictions.size
       weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/cv.predictions.size)
+      rmse = Math.sqrt(rmse/predictions.size)
       weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
       # TODO check!!
 =begin
-      cv.predictions.sort! do |a,b|
+      predictions.sort! do |a,b|
         relative_error_a = (a[1]-a[2]).abs/a[1].to_f
         relative_error_a = 1/relative_error_a if relative_error_a < 1
         relative_error_b = (b[1]-b[2]).abs/b[1].to_f
@@ -188,15 +187,14 @@ module OpenTox
         [relative_error_b,b[3]] <=> [relative_error_a,a[3]]
       end
 =end
-      cv.update_attributes(
+      update_attributes(
         mae: mae,
         rmse: rmse,
         weighted_mae: weighted_mae,
         weighted_rmse: weighted_rmse,
-        r_squared: r**2
+        r_squared: r**2,
+        finished_at: Time.now
       )
-      cv.save
-      cv
     end
 
     def misclassifications n=nil
@@ -277,5 +275,20 @@ module OpenTox
     end
   end
 
+  class RepeatedCrossValidation
+    field :crossvalidation_ids, type: Array, default: []
+    def self.create model, folds=10, repeats=3
+      repeated_cross_validation = self.new
+      repeats.times do
+        repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+      end
+      repeated_cross_validation.save
+      repeated_cross_validation
+    end
+    def crossvalidations
+      crossvalidation_ids.collect{|id| CrossValidation.find(id)}
+    end
+  end
+
 
 end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 851fabd..d884716 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -47,6 +47,7 @@ module OpenTox
           @data_entries = Marshal.load(data_entry_file.data)
           bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
           bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
+          # TODO: data_entries can be empty, poorly reproducible, mongo problem?
           bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
           #$logger.debug "Retrieving data: #{Time.now-t}"
         end
diff --git a/lib/error.rb b/lib/error.rb
index 8fe8a1e..39b3c76 100644
--- a/lib/error.rb
+++ b/lib/error.rb
@@ -58,7 +58,7 @@ module OpenTox
     OpenTox.const_set error[:class],c
     
     # define global methods for raising errors, eg. bad_request_error
-    Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil|
+    Object.send(:define_method, error[:method]) do |message|
       raise c.new(message)
     end
   end
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 2f51756..7849337 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -2,45 +2,22 @@ module OpenTox
 
   class Experiment
     field :dataset_ids, type: Array
-    field :model_algorithms, type: Array
-    field :model_ids, type: Array, default: []
-    field :crossvalidation_ids, type: Array, default: []
-    field :prediction_algorithms, type: Array
-    field :neighbor_algorithms, type: Array
-    field :neighbor_algorithm_parameters, type: Array
+    field :model_settings, type: Array
+    field :results, type: Hash, default: {}
   end
 
-  # TODO more sophisticated experimental design
   def run 
     dataset_ids.each do |dataset_id|
       dataset = Dataset.find(dataset_id)
-      model_algorithms.each do |model_algorithm|
-        prediction_algorithms.each do |prediction_algorithm|
-          neighbor_algorithms.each do |neighbor_algorithm|
-            neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter|
-              $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
-              model = Object.const_get(model_algorithm).create dataset
-              model.prediction_algorithm = prediction_algorithm
-              model.neighbor_algorithm = neighbor_algorithm
-              model.neighbor_algorithm_parameters = neighbor_algorithm_parameter
-              model.save
-              model_ids << model.id
-              cv = nil
-              if dataset.features.first.nominal
-                cv = ClassificationCrossValidation
-              elsif dataset.features.first.numeric
-                cv = RegressionCrossValidation
-              end
-              if cv
-                $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
-                crossvalidation = cv.create model
-                self.crossvalidation_ids << crossvalidation.id
-              else
-                $logger.warn "#{dataset.features.first} is neither nominal nor numeric."
-              end
-            end
-          end
-        end
+      results[dataset_id.to_s] = []
+      model_settings.each do |setting|
+        model = Object.const_get(setting[:algorithm]).create dataset
+        model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm]
+        model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm]
+        model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter]
+        model.save
+        repeated_crossvalidation = RepeatedCrossValidation.create model
+        results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
       end
     end
     save
@@ -54,13 +31,37 @@ module OpenTox
   end
 
   def report
-    # TODO create ggplot2 report
-    self.crossvalidation_ids.each do |id|
-      cv = CrossValidation.find(id)
-      file = "/tmp/#{id}.svg"
-      File.open(file,"w+"){|f| f.puts cv.correlation_plot}
-      `inkview '#{file}'`
+    # TODO significances
+    report = {}
+    report[:name] = name
+    report[:experiment_id] = self.id.to_s
+    dataset_ids.each do |dataset_id|
+      dataset_name = Dataset.find(dataset_id).name
+      report[dataset_name] = []
+      results[dataset_id.to_s].each do |result|
+        model = Model::Lazar.find(result[:model_id])
+        repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+        crossvalidations = repeated_cv.crossvalidations
+        summary = {}
+        [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+          summary[key] = model[key]
+        end
+        summary[:nr_instances] = crossvalidations.first.nr_instances
+        summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+        summary[:time] = crossvalidations.collect{|cv| cv.time}
+        if crossvalidations.first.is_a? ClassificationCrossValidation
+          summary[:accuracies] = crossvalidations.collect{|cv| cv.accuracy}
+        elsif crossvalidations.first.is_a? RegressionCrossValidation
+          summary[:r_squared] = crossvalidations.collect{|cv| cv.r_squared}
+        end
+        report[dataset_name] << summary
+        #p repeated_cv.crossvalidations.collect{|cv| cv.accuracy}
+        #file = "/tmp/#{id}.svg"
+        #File.open(file,"w+"){|f| f.puts cv.correlation_plot}
+        #`inkview '#{file}'`
+      end
     end
+    report
   end
 
 end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index decbe69..9b02053 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -15,7 +15,8 @@ ENV["MONGOID_ENV"] ||= "development"
 # TODO remove config files, change default via ENV or directly in Mongoid class
 Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
 Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongoid.default_client
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
+#$mongo = Mongoid.default_client
 $gridfs = $mongo.database.fs
 
 # R setup
@@ -42,7 +43,7 @@ ENV['FMINER_SILENT'] = 'true'
 ENV['FMINER_NR_HITS'] = 'true'
 
 # OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
 
 [ # be aware of the require sequence as it affects class/method overwrites
   "overwrite.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 0155fc8..ddb69e4 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -28,9 +28,6 @@ module OpenTox
       field :neighbor_algorithm, type: String
       field :neighbor_algorithm_parameters, type: Hash
 
-      #attr_accessor :prediction_dataset
-      #attr_accessor :training_dataset
-
       # Create a lazar model from a training_dataset and a feature_dataset
       # @param [OpenTox::Dataset] training_dataset
       # @return [OpenTox::Model::Lazar] Regression or classification model
diff --git a/test/experiment.rb b/test/experiment.rb
index c465d7b..cad4fa7 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -4,27 +4,61 @@ class ExperimentTest < MiniTest::Test
 
   def test_regression_experiment
     datasets = [
-      "EPAFHM.csv",
-      "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
+      "EPAFHM.medi.csv",
+      #"EPAFHM.csv",
+      #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
       "LOAEL_mmol_corrected_smiles.csv"
+    ]
+    experiment = Experiment.create(
+      :name => "Default regression for datasets #{datasets}.",
+      :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+      :model_settings => [
+        {
+          :algorithm => "OpenTox::Model::LazarRegression",
+        }
       ]
-    model_algorithms = ["OpenTox::Model::LazarRegression"]
-    neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"]
-    prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"]
-    neighbor_algorithm_parameters = [{:min_sim => 0.7}]
+    )
+    experiment.run
+    puts experiment.report.to_yaml
+    assert_equal datasets.size, experiment.results.size
+    experiment.results.each do |dataset_id, result|
+      assert_equal 1, result.size
+      result.each do |r|
+        assert_kind_of BSON::ObjectId, r[:model_id]
+        assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+      end
+    end
+  end
+
+  def test_classification_experiment
+
+    datasets = [ "hamster_carcinogenicity.csv" ]
     experiment = Experiment.create(
-      :name => "Regression for datasets #{datasets}.",
+      :name => "Fminer vs fingerprint classification for datasets #{datasets}.",
       :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
-      :model_algorithms => model_algorithms,
-      :neighbor_algorithms => neighbor_algorithms,
-      :neighbor_algorithm_parameters => neighbor_algorithm_parameters,
-      :prediction_algorithms => prediction_algorithms,
+      :model_settings => [
+        {
+          :algorithm => "OpenTox::Model::LazarClassification",
+        },{
+          :algorithm => "OpenTox::Model::LazarClassification",
+          :neighbor_algorithm_parameter => {:min_sim => 0.3}
+        },
+        #{
+          #:algorithm => "OpenTox::Model::LazarFminerClassification",
+        #}
+      ]
     )
     experiment.run
 =begin
-    p experiment
-    experiment.report
+    experiment = Experiment.find "55f944a22b72ed7de2000000"
 =end
-    refute_empty experiment.crossvalidation_ids
+    puts experiment.report.to_yaml
+    experiment.results.each do |dataset_id, result|
+      assert_equal 2, result.size
+      result.each do |r|
+        assert_kind_of BSON::ObjectId, r[:model_id]
+        assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id]
+      end
+    end
   end
 end
diff --git a/test/validation.rb b/test/validation.rb
index a4c3d80..dfa2c81 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -33,4 +33,16 @@ class ValidationTest < MiniTest::Test
     #assert cv.weighted_mae < cv.mae
   end
 
+  def test_repeated_crossvalidation # every repeat of a repeated crossvalidation must reach the accuracy floor
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    model = Model::LazarClassification.create dataset
+    repeated_cv = RepeatedCrossValidation.create model
+    crossvalidations = repeated_cv.crossvalidations
+    assert_equal 3, crossvalidations.size # RepeatedCrossValidation.create defaults to repeats=3; also keeps the loop below from passing vacuously
+    crossvalidations.each do |cv|
+      refute_nil cv, "crossvalidation could not be loaded"
+      assert cv.accuracy > 0.7, "accuracy #{cv.accuracy} is not above 0.7"
+    end
+  end
+
 end
-- 
cgit v1.2.3


From 2fdecbed76c4db8dfe3f10f825fed9772e653197 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 16 Sep 2015 16:52:18 +0200
Subject: generic openbabel fingerprints

---
 lib/compound.rb  | 31 +++++++++++++++++++++++++++++--
 test/compound.rb | 11 +++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lib/compound.rb b/lib/compound.rb
index 6adf3c0..7f175ca 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -42,6 +42,35 @@ module OpenTox
       compound
     end
 
+    def openbabel_fingerprint type="FP2"
+      fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+      obmol = OpenBabel::OBMol.new
+      obconversion = OpenBabel::OBConversion.new
+      obconversion.set_in_format "smi"
+      obconversion.read_string obmol, smiles
+      result = OpenBabel::VectorUnsignedInt.new
+      fp.get_fingerprint(obmol,result)
+      # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+      #p OpenBabel::OBFingerprint.describe_bits(result)
+      result = result.to_a
+      # convert result to a list of the bits that are set
+      # from openbabel/scripts/python/pybel.py line 830
+      # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+      bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+      bits_set = []
+      start = 1
+      result.each do |x|
+        i = start
+        while x > 0 do
+          bits_set << i if (x % 2) == 1
+          x >>= 1
+          i += 1
+        end
+        start += bitsperint
+      end
+      bits_set
+    end
+
     # Create a compound from smiles string
     # @example
     #   compound = OpenTox::Compound.from_smiles("c1ccccc1")
@@ -202,8 +231,6 @@ module OpenTox
       $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
         
     end
-=begin
-=end
 
     private
 
diff --git a/test/compound.rb b/test/compound.rb
index 06c19a2..6deba4e 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -97,4 +97,15 @@ print c.sdf
     c = Compound.from_inchi(inchi)
     assert_equal inchi, c.inchi
   end
+
+  def test_openbabel_fingerprint
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size
+    end
+  end
 end
-- 
cgit v1.2.3


From 6ac119c32cef094d4f1c2fb5c2daa4e274401f70 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 17 Sep 2015 14:56:25 +0200
Subject: neighbor calculation moved to Compound class

---
 lib/compound.rb        | 84 +++++++++++++++++++++++++++++++++++---------------
 lib/crossvalidation.rb |  3 +-
 lib/dataset.rb         |  2 +-
 lib/experiment.rb      |  4 +--
 lib/lazar.rb           |  2 +-
 lib/model.rb           | 24 +++++++++++----
 lib/neighbor.rb        | 25 ---------------
 lib/opentox.rb         |  1 -
 test/compound.rb       | 26 ++++++++++++++++
 test/experiment.rb     | 31 +++++++++++++++++--
 10 files changed, 139 insertions(+), 63 deletions(-)
 delete mode 100644 lib/neighbor.rb

diff --git a/lib/compound.rb b/lib/compound.rb
index 7f175ca..7abd913 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -19,8 +19,11 @@ module OpenTox
     field :png_id, type: BSON::ObjectId
     field :svg_id, type: BSON::ObjectId
     field :sdf_id, type: BSON::ObjectId
+    field :fp2, type: Array
+    field :fp3, type: Array
     field :fp4, type: Array
     field :fp4_size, type: Integer
+    field :maccs, type: Array
 
     index({smiles: 1}, {unique: true})
 
@@ -43,32 +46,35 @@ module OpenTox
     end
 
     def openbabel_fingerprint type="FP2"
-      fp = OpenBabel::OBFingerprint.find_fingerprint(type)
-      obmol = OpenBabel::OBMol.new
-      obconversion = OpenBabel::OBConversion.new
-      obconversion.set_in_format "smi"
-      obconversion.read_string obmol, smiles
-      result = OpenBabel::VectorUnsignedInt.new
-      fp.get_fingerprint(obmol,result)
-      # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
-      #p OpenBabel::OBFingerprint.describe_bits(result)
-      result = result.to_a
-      # convert result to a list of the bits that are set
-      # from openbabel/scripts/python/pybel.py line 830
-      # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
-      bitsperint = OpenBabel::OBFingerprint.getbitsperint()
-      bits_set = []
-      start = 1
-      result.each do |x|
-        i = start
-        while x > 0 do
-          bits_set << i if (x % 2) == 1
-          x >>= 1
-          i += 1
+      unless self.send(type.downcase.to_sym) # stored fingerprint
+        fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+        obmol = OpenBabel::OBMol.new
+        obconversion = OpenBabel::OBConversion.new
+        obconversion.set_in_format "smi"
+        obconversion.read_string obmol, smiles
+        result = OpenBabel::VectorUnsignedInt.new
+        fp.get_fingerprint(obmol,result)
+        # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+        #p OpenBabel::OBFingerprint.describe_bits(result)
+        # convert result to a list of the bits that are set
+        # from openbabel/scripts/python/pybel.py line 830
+        # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+        result = result.to_a
+        bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+        bits_set = []
+        start = 1 # bit positions are numbered from 1
+        result.each do |x|
+          i = start
+          while x > 0 do
+            bits_set << i if (x % 2) == 1
+            x >>= 1
+            i += 1
+          end
+          start += bitsperint # the next integer covers the next bitsperint positions
         end
-        start += bitsperint
+        update(type.downcase.to_sym => bits_set) # persist under the field named after the type (fp2, fp3, fp4, maccs); update takes an attribute hash
       end
-      bits_set
+      self.send(type.downcase.to_sym) # stored bit list, freshly computed above if it was missing
     end
 
     # Create a compound from smiles string
@@ -206,6 +212,36 @@ module OpenTox
       self["chemblid"]
     end
 
+    def fingerprint_neighbors params # => [[compound_id, tanimoto], ...] for the training dataset's compounds, most similar first
+      bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+      type, min_sim = params[:type], params[:min_sim]
+      query_bits = openbabel_fingerprint type # set bit positions of the query compound
+      hits = []
+      Dataset.find(params[:training_dataset_id]).compounds.each do |candidate|
+        next if self == candidate # a compound is never its own neighbor
+        candidate_bits = candidate.openbabel_fingerprint type
+        tanimoto = (query_bits & candidate_bits).size/(query_bits | candidate_bits).size.to_f # shared bits / bits set in either
+        hits << [candidate.id, tanimoto] if tanimoto >= min_sim
+      end
+      hits.sort{|a,b| b.last <=> a.last}
+    end
+
+    def fminer_neighbors params # => [[compound_id, tanimoto], ...] in feature dataset row order (not sorted)
+      bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
+      feature_dataset = Dataset.find params[:feature_dataset_id]
+      query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
+      hits = []
+
+      # score every stored row against the query and keep those at or above the cutoff
+      feature_dataset.data_entries.each_with_index do |row, idx|
+        tanimoto = Algorithm::Similarity.tanimoto row, query_fingerprint
+        next unless tanimoto >= params[:min_sim]
+        # use compound_ids, instantiation of Compounds is too time consuming
+        hits << [feature_dataset.compound_ids[idx], tanimoto]
+      end
+      hits
+    end
+
     def neighbors threshold=0.7
       # TODO restrict to dataset
       # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index f480932..337b434 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -279,7 +279,8 @@ module OpenTox
     field :crossvalidation_ids, type: Array, default: []
     def self.create model, folds=10, repeats=3
       repeated_cross_validation = self.new
-      repeats.times do
+      repeats.times do |n|
+        $logger.debug "Crossvalidation #{n+1} for #{model.name}"
         repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
       end
       repeated_cross_validation.save
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d884716..7d889f8 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -152,7 +152,7 @@ module OpenTox
       name = File.basename(file,".*")
       dataset = self.find_by(:source => source, :name => name)
       if dataset
-        $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})."
+        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
       else
         $logger.debug "Parsing #{file}."
         table = CSV.read file, :skip_blanks => true
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 7849337..985a491 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -2,7 +2,7 @@ module OpenTox
 
   class Experiment
     field :dataset_ids, type: Array
-    field :model_settings, type: Array
+    field :model_settings, type: Array, default: []
     field :results, type: Hash, default: {}
   end
 
@@ -26,7 +26,7 @@ module OpenTox
   def self.create params
     experiment = self.new
     $logge.debug "Experiment started ..."
-    experiment.run params
+    #experiment.run params
     experiment
   end
 
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 9b02053..89b50f7 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -59,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat
   "bbrc.rb",
   "model.rb",
   "similarity.rb",
-  "neighbor.rb",
+  #"neighbor.rb",
   "classification.rb",
   "regression.rb",
   "validation.rb",
diff --git a/lib/model.rb b/lib/model.rb
index ddb69e4..9892f64 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -39,6 +39,7 @@ module OpenTox
         prediction_feature = training_dataset.features.first
         prediction_feature.nominal ?  lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
         lazar.training_dataset_id = training_dataset.id
+        lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
         lazar.prediction_feature_id = prediction_feature.id
         lazar.name = "#{training_dataset.name} #{prediction_feature.name}" 
 
@@ -78,7 +79,8 @@ module OpenTox
             predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
             next
           end
-          neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+          neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+          #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
           # add activities
           # TODO: improve efficiency, takes 3 times longer than previous version
           neighbors.collect! do |n|
@@ -129,8 +131,12 @@ module OpenTox
       def initialize
         super
         self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
-        self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
-        self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+        self.neighbor_algorithm = "fingerprint_neighbors"
+        self.neighbor_algorithm_parameters = {
+          :type => "FP4",
+          :training_dataset_id => training_dataset_id,
+          :min_sim => 0.7
+        }
       end
     end
 
@@ -141,7 +147,7 @@ module OpenTox
         model = super(training_dataset)
         model.update "_type" => self.to_s # adjust class
         model = self.find model.id # adjust class
-        model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+        model.neighbor_algorithm = "fminer_neighbors"
         model.neighbor_algorithm_parameters = {
           :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
           :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
@@ -154,11 +160,17 @@ module OpenTox
     end
 
     class LazarRegression < Lazar
+
       def initialize
         super
-        self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+        #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+        self.neighbor_algorithm = "fingerprint_neighbors"
         self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" 
-        self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+        self.neighbor_algorithm_parameters = {
+          :type => "FP4",
+          :training_dataset_id => self.training_dataset_id,
+          :min_sim => 0.7
+        }
       end
     end
 
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
deleted file mode 100644
index d849cbf..0000000
--- a/lib/neighbor.rb
+++ /dev/null
@@ -1,25 +0,0 @@
-module OpenTox
-  module Algorithm
-    class Neighbor
-
-      def self.fingerprint_similarity compound, params={}
-        compound.neighbors params[:min_sim]
-      end
-
-      def self.fminer_similarity compound, params
-        feature_dataset = Dataset.find params[:feature_dataset_id]
-        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
-        neighbors = []
-
-        # find neighbors
-        feature_dataset.data_entries.each_with_index do |fingerprint, i|
-          sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
-          if sim > params[:min_sim]
-            neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-          end
-        end
-        neighbors
-      end
-    end
-  end
-end
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 875487c..186c87a 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -14,7 +14,6 @@ module OpenTox
       store_in collection: klass.downcase.pluralize
       field :name,  type: String
       field :warnings, type: Array, default: []
-
     end
     OpenTox.const_set klass,c
   end
diff --git a/test/compound.rb b/test/compound.rb
index 6deba4e..6a3c696 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -108,4 +108,30 @@ print c.sdf
       assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size
     end
   end
+
+  def test_fingerprint_neighbors # runs fingerprint_neighbors for each fingerprint type; only the FP4 hits are asserted, against Compound#neighbors
+    types = ["FP2", "FP3", "FP4", "MACCS"]
+    min_sim = 0.7
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      p c.smiles
+      types.each do |type|
+        p type
+        neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
+        p neighbors.collect{|n| [Compound.find(n.first).smiles,n.last]}
+        if type == "FP4"
+          fp4_neighbors = c.neighbors # called without arguments, so its default threshold (0.7) applies; that search is not restricted to training_dataset
+          neighbors.each do |n|
+            p [Compound.find(n.first).smiles,n.last] unless fp4_neighbors.include?(n) # print the offending pair before the assertion fails
+            assert_includes fp4_neighbors, n
+          end
+        end
+      end
+    end
+  end
 end
diff --git a/test/experiment.rb b/test/experiment.rb
index cad4fa7..4b54768 100644
--- a/test/experiment.rb
+++ b/test/experiment.rb
@@ -18,7 +18,7 @@ class ExperimentTest < MiniTest::Test
         }
       ]
     )
-    experiment.run
+    #experiment.run
     puts experiment.report.to_yaml
     assert_equal datasets.size, experiment.results.size
     experiment.results.each do |dataset_id, result|
@@ -48,7 +48,7 @@ class ExperimentTest < MiniTest::Test
         #}
       ]
     )
-    experiment.run
+    #experiment.run
 =begin
     experiment = Experiment.find "55f944a22b72ed7de2000000"
 =end
@@ -61,4 +61,31 @@ class ExperimentTest < MiniTest::Test
       end
     end
   end
+
+  def test_regression_fingerprints # one regression model per fingerprint type / similarity cutoff combination on a single dataset
+    datasets = [
+      "LOAEL_mmol_corrected_smiles.csv"
+    ]
+    min_sims = [0.3,0.7]
+    types = ["FP2","FP3","FP4","MACCS"]
+    experiment = Experiment.create(
+      :name => "Regression fingerprint comparison for datasets #{datasets}.", # previously a copy of the classification experiment's name
+      :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+    )
+    types.each do |type|
+      min_sims.each do |min_sim|
+        experiment.model_settings << {
+          :algorithm => "OpenTox::Model::LazarRegression",
+          :neighbor_algorithm => "fingerprint_neighbors",
+          :neighbor_algorithm_parameter => {
+            :type => type,
+            :min_sim => min_sim,
+          }
+        }
+      end
+    end
+    experiment.run
+    p experiment.report
+    experiment.results.each{|dataset_id, result| assert_equal types.size*min_sims.size, result.size} # run appends one result per model setting for each dataset
+  end
 end
-- 
cgit v1.2.3


From 33989261450bba279b4e002e5e4ea0475d742abb Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 18 Sep 2015 13:01:59 +0200
Subject: fix for empty values

---
 lib/dataset.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 7d889f8..00e2bc3 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -203,7 +203,7 @@ module OpenTox
             feature = NominalFeature.find_or_create_by(metadata)
           end
         end
-        feature_ids << feature.id
+        feature_ids << feature.id if feature
       end
       
       $logger.debug "Feature values: #{Time.now-time}"
@@ -245,7 +245,7 @@ module OpenTox
         end
 
         compound_ids << compound.id
-        @data_entries << Array.new(table.first.size-1)
+        @data_entries << Array.new(table.first.size-1) if (table.first.size-1) > 0
         
         vals.each_with_index do |v,j|
           if v.blank?
-- 
cgit v1.2.3