From 7c3bd90c26dfeea2db3cf74a1cefc23d8dece7c0 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 15 Mar 2016 17:40:40 +0100
Subject: validation tests pass

---
 lib/classification.rb           | 73 --------------------------------------
 lib/crossvalidation.rb          | 68 +++++++++++++++++-------------------
 lib/dataset.rb                  | 23 +-----------
 lib/leave-one-out-validation.rb | 16 ++++-----
 lib/model.rb                    | 77 ++++++++++++++---------------------------
 lib/regression.rb               | 43 ++++++++++++-----------
 lib/validation.rb               |  3 +-
 test/all.rb                     |  4 +--
 test/classification.rb          | 41 ++++++++++++++++++++++
 test/dataset.rb                 | 12 +------
 test/descriptor-long.rb         | 26 --------------
 test/fminer-long.rb             | 41 ----------------------
 test/fminer.rb                  | 52 ----------------------------
 test/lazar-classification.rb    | 42 ----------------------
 test/lazar-fminer.rb            | 51 ---------------------------
 test/prediction_models.rb       |  1 +
 test/regression.rb              |  2 +-
 test/validation.rb              | 62 +++++----------------------------
 18 files changed, 146 insertions(+), 491 deletions(-)
 create mode 100644 test/classification.rb
 delete mode 100644 test/descriptor-long.rb
 delete mode 100644 test/fminer-long.rb
 delete mode 100644 test/fminer.rb
 delete mode 100644 test/lazar-classification.rb
 delete mode 100644 test/lazar-fminer.rb

diff --git a/lib/classification.rb b/lib/classification.rb
index abbb5b3..0202940 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -28,80 +28,7 @@ module OpenTox
           bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
         end
       end
-
-      # Classification with majority vote from neighbors weighted by similarity
-      # @param [Hash] params Keys `:activities, :sims, :value_map` are required
-      # @return [Numeric] A prediction value.
-      def self.fminer_weighted_majority_vote neighbors, training_dataset
-
-        neighbor_contribution = 0.0
-        confidence_sum = 0.0
-
-        $logger.debug "Weighted Majority Vote Classification."
-
-        values = neighbors.collect{|n| n[2]}.uniq
-        neighbors.each do |neighbor|
-          i = training_dataset.compound_ids.index n.id
-          neighbor_weight = neighbor[1]
-          activity = values.index(neighbor[2]) + 1 # map values to integers > 1
-          neighbor_contribution += activity * neighbor_weight
-          if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
-            case activity
-            when 1
-              confidence_sum -= neighbor_weight
-            when 2
-              confidence_sum += neighbor_weight
-            end
-          else
-            confidence_sum += neighbor_weight
-          end
-        end
-        if values.size == 2 
-          if confidence_sum >= 0.0
-            prediction = values[1]
-          elsif confidence_sum < 0.0
-            prediction = values[0] 
-          end
-        elsif values.size == 1 # all neighbors have the same value
-          prediction = values[0] 
-        else 
-          prediction = (neighbor_contribution/confidence_sum).round  # AM: new multinomial prediction
-        end 
-
-        confidence = (confidence_sum/neighbors.size).abs 
-        {:value => prediction, :confidence => confidence.abs}
-      end
-
-      # Local support vector regression from neighbors 
-      # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
-      # @return [Numeric] A prediction value.
-      def self.local_svm_classification(params)
-
-        confidence = 0.0
-        prediction = nil
-
-        $logger.debug "Local SVM."
-        if params[:activities].size>0
-          if params[:props]
-            n_prop = params[:props][0].collect.to_a
-            q_prop = params[:props][1].collect.to_a
-            props = [ n_prop, q_prop ]
-          end
-          activities = params[:activities].collect.to_a
-          activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
-          prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
-          prediction = prediction.sub(/Val/,"") if prediction # Convert back
-          confidence = 0.0 if prediction.nil?
-          confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
-        end
-        {:value => prediction, :confidence => confidence}
-
-      end
-
-
-
     end
-
   end
 end
 
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index cd94e33..08a5ad3 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -52,9 +52,10 @@ module OpenTox
       cv.update_attributes(
         nr_instances: nr_instances,
         nr_unpredicted: nr_unpredicted,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+        predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
       )
       $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+      cv.statistics
       cv
     end
   end
@@ -78,23 +79,26 @@ module OpenTox
       true_rate = {}
       predictivity = {}
       predictions.each do |pred|
-        compound_id,activity,prediction,confidence = pred
-        if activity and prediction and confidence.numeric? 
-          if prediction == activity
-            if prediction == accept_values[0]
-              confusion_matrix[0][0] += 1
-              weighted_confusion_matrix[0][0] += confidence
-            elsif prediction == accept_values[1]
-              confusion_matrix[1][1] += 1
-              weighted_confusion_matrix[1][1] += confidence
-            end
-          elsif prediction != activity
-            if prediction == accept_values[0]
-              confusion_matrix[0][1] += 1
-              weighted_confusion_matrix[0][1] += confidence
-            elsif prediction == accept_values[1]
-              confusion_matrix[1][0] += 1
-              weighted_confusion_matrix[1][0] += confidence
+        compound_id,activities,prediction,confidence = pred
+        if activities and prediction #and confidence.numeric? 
+          if activities.uniq.size == 1
+            activity = activities.uniq.first
+            if prediction == activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][0] += 1
+                #weighted_confusion_matrix[0][0] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][1] += 1
+                #weighted_confusion_matrix[1][1] += confidence
+              end
+            elsif prediction != activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][1] += 1
+                #weighted_confusion_matrix[0][1] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][0] += 1
+                #weighted_confusion_matrix[1][0] += confidence
+              end
             end
           end
         else
@@ -108,17 +112,17 @@ module OpenTox
         predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
       end
       confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
-        end
-      end
+      #weighted_confusion_matrix.each do |r|
+        #r.each do |c|
+          #confidence_sum += c
+        #end
+      #end
       update_attributes(
         accept_values: accept_values,
         confusion_matrix: confusion_matrix,
-        weighted_confusion_matrix: weighted_confusion_matrix,
+        #weighted_confusion_matrix: weighted_confusion_matrix,
         accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
         true_rate: true_rate,
         predictivity: predictivity,
         finished_at: Time.now
@@ -161,20 +165,12 @@ module OpenTox
 
     field :rmse, type: Float
     field :mae, type: Float
-    field :weighted_rmse, type: Float
-    field :weighted_mae, type: Float
     field :r_squared, type: Float
     field :correlation_plot_id, type: BSON::ObjectId
-    field :confidence_plot_id, type: BSON::ObjectId
 
     def statistics
       rmse = 0
-      weighted_rmse = 0
-      rse = 0
-      weighted_rse = 0
       mae = 0
-      weighted_mae = 0
-      confidence_sum = 0
       x = []
       y = []
       predictions.each do |pred|
@@ -185,10 +181,10 @@ module OpenTox
             y << -Math.log10(prediction)
             error = Math.log10(prediction)-Math.log10(activity.median)
             rmse += error**2
-            weighted_rmse += confidence*error**2
+            #weighted_rmse += confidence*error**2
             mae += error.abs
-            weighted_mae += confidence*error.abs
-            confidence_sum += confidence
+            #weighted_mae += confidence*error.abs
+            #confidence_sum += confidence
           end
         else
           warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
diff --git a/lib/dataset.rb b/lib/dataset.rb
index af851b5..5d8aeaf 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -85,6 +85,7 @@ module OpenTox
             compound.dataset_ids << dataset.id
             compound.save
           end
+          dataset.save
           dataset
         end
         start = last+1
@@ -283,28 +284,6 @@ module OpenTox
       end
     end
 
-    def scale
-      scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
-      centers = []
-      scales = []
-      feature_ids.each_with_index do |feature_id,col| 
-        R.assign "x", data_entries.collect{|de| de[col]}
-        R.eval "scaled = scale(x,center=T,scale=T)"
-        centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
-        scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
-        R.eval("scaled").to_ruby.each_with_index do |value,row|
-          scaled_data_entries[row][col] = value
-        end
-      end
-      scaled_dataset = ScaledDataset.new(attributes)
-      scaled_dataset["_id"] = BSON::ObjectId.new
-      scaled_dataset["_type"] = "OpenTox::ScaledDataset"
-      scaled_dataset.centers = centers
-      scaled_dataset.scales = scales
-      scaled_dataset.data_entries = scaled_data_entries
-      scaled_dataset.save
-      scaled_dataset
-    end
   end
 
   # Dataset for lazar predictions
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 9db10c6..2cd13db 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -18,7 +18,7 @@ module OpenTox
       predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
       loo.nr_instances = predictions.size
       predictions.select!{|p| p[:value]} # remove unpredicted
-      loo.predictions = predictions.sort{|a,b| b[:confidence] <=> a[:confidence]}
+      loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
       loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
       loo.statistics
       loo.save
@@ -126,8 +126,8 @@ module OpenTox
 
     field :rmse, type: Float, default: 0.0
     field :mae, type: Float, default: 0
-    field :weighted_rmse, type: Float, default: 0
-    field :weighted_mae, type: Float, default: 0
+    #field :weighted_rmse, type: Float, default: 0
+    #field :weighted_mae, type: Float, default: 0
     field :r_squared, type: Float
     field :correlation_plot_id, type: BSON::ObjectId
     field :confidence_plot_id, type: BSON::ObjectId
@@ -143,10 +143,10 @@ module OpenTox
             measured_values << activity
             error = Math.log10(pred[:value])-Math.log10(activity)
             self.rmse += error**2
-            self.weighted_rmse += pred[:confidence]*error**2
+            #self.weighted_rmse += pred[:confidence]*error**2
             self.mae += error.abs
-            self.weighted_mae += pred[:confidence]*error.abs
-            confidence_sum += pred[:confidence]
+            #self.weighted_mae += pred[:confidence]*error.abs
+            #confidence_sum += pred[:confidence]
           end
         end
         if pred[:database_activities].empty?
@@ -160,9 +160,9 @@ module OpenTox
       r = R.eval("r").to_ruby
 
       self.mae = self.mae/predictions.size
-      self.weighted_mae = self.weighted_mae/confidence_sum
+      #self.weighted_mae = self.weighted_mae/confidence_sum
       self.rmse = Math.sqrt(self.rmse/predictions.size)
-      self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
+      #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
       self.r_squared = r**2
       self.finished_at = Time.now
       save
diff --git a/lib/model.rb b/lib/model.rb
index ebc0db3..f21ea54 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,13 +47,32 @@ module OpenTox
         self
       end
 
-      def predict object
+      def predict_compound compound
+        prediction_feature = Feature.find prediction_feature_id
+        neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+        # remove neighbors without prediction_feature
+        # check for database activities (neighbors may include query compound)
+        database_activities = nil
+        prediction = {}
+        if neighbors.collect{|n| n["_id"]}.include? compound.id
+
+          database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+          prediction[:database_activities] = database_activities
+          prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
+          neighbors.delete_if{|n| n["_id"] == compound.id}
+        end
+        neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+        if neighbors.empty?
+          prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
+        else
+          prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
+        end
+        prediction
+      end
 
-        t = Time.now
-        at = Time.now
+      def predict object
 
         training_dataset = Dataset.find training_dataset_id
-        prediction_feature = Feature.find prediction_feature_id
 
         # parse data
         compounds = []
@@ -70,30 +89,7 @@ module OpenTox
 
         # make predictions
         predictions = []
-        neighbors = []
-        compounds.each_with_index do |compound,c|
-          t = Time.new
-
-          neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
-          # remove neighbors without prediction_feature
-          # check for database activities (neighbors may include query compound)
-          database_activities = nil
-          prediction = {}
-          if neighbors.collect{|n| n["_id"]}.include? compound.id
-
-            database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
-            prediction[:database_activities] = database_activities
-            prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
-            neighbors.delete_if{|n| n["_id"] == compound.id}
-          end
-          neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
-          if neighbors.empty?
-            prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
-          else
-            prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
-          end
-          predictions << prediction
-        end 
+        predictions = compounds.collect{|c| predict_compound c}
 
         # serialize result
         case object.class.to_s
@@ -105,7 +101,8 @@ module OpenTox
           return predictions
         when "OpenTox::Dataset"
           # prepare prediction dataset
-          measurement_feature = prediction_feature
+          measurement_feature = Feature.find prediction_feature_id
+
           prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
           prediction_dataset = LazarPrediction.new(
             :name => "Lazar prediction for #{prediction_feature.name}",
@@ -114,11 +111,9 @@ module OpenTox
 
           )
           confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
-          # TODO move into warnings field
           warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
           prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
           prediction_dataset.compounds = compounds
-          # TODO fix dataset measurements
           prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
           prediction_dataset.save
           return prediction_dataset
@@ -171,25 +166,6 @@ module OpenTox
       end
     end
 
-    class LazarFminerClassification < LazarClassification
-      field :feature_calculation_parameters, type: Hash
-
-      def self.create training_dataset, fminer_params={}
-        model = super(training_dataset)
-        model.update "_type" => self.to_s # adjust class
-        model = self.find model.id # adjust class
-        model.neighbor_algorithm = "fminer_neighbors"
-        model.neighbor_algorithm_parameters = {
-          :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
-          :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
-          :min_sim => 0.3
-        }
-        model.feature_calculation_parameters = fminer_params
-        model.save
-        model
-      end
-    end
-
     class Prediction
       include OpenTox
       include Mongoid::Document
@@ -238,7 +214,6 @@ module OpenTox
         training_dataset = Dataset.from_csv_file file
         model = nil
         if training_dataset.features.first.nominal?
-          #model = LazarFminerClassification.create training_dataset
           model = LazarClassification.create training_dataset
         elsif training_dataset.features.first.numeric?
           model = LazarRegression.create training_dataset
diff --git a/lib/regression.rb b/lib/regression.rb
index e0b109e..b8efd30 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,25 +1,23 @@
 module OpenTox
   module Algorithm
     
-    # TODO add LOO errors
     class Regression
 
       def self.local_weighted_average compound, params
         weighted_sum = 0.0
         sim_sum = 0.0
-        confidence = 0.0
         neighbors = params[:neighbors]
         neighbors.each do |row|
           sim = row["tanimoto"]
-          confidence = sim if sim > confidence # distance to nearest neighbor
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            weighted_sum += sim*Math.log10(act)
-            sim_sum += sim
+          if row["features"][params[:prediction_feature_id].to_s]
+            row["features"][params[:prediction_feature_id].to_s].each do |act|
+              weighted_sum += sim*Math.log10(act)
+              sim_sum += sim
+            end
           end
         end
-        confidence = 0 if confidence.nan?
         sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
-        {:value => prediction,:confidence => confidence}
+        {:value => prediction}
       end
 
       # TODO explicit neighbors, also for physchem
@@ -31,15 +29,18 @@ module OpenTox
         weights = []
         fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
         
+        #p neighbors
         neighbors.each_with_index do |row,i|
           neighbor = Compound.find row["_id"]
           fingerprint = neighbor.fingerprint
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            activities << Math.log10(act)
-            weights << row["tanimoto"]
-            fingerprint_ids.each_with_index do |id,j|
-              fingerprints[id] ||= []
-              fingerprints[id] << fingerprint.include?(id) 
+          if row["features"][params[:prediction_feature_id].to_s]
+            row["features"][params[:prediction_feature_id].to_s].each do |act|
+              activities << Math.log10(act)
+              weights << row["tanimoto"]
+              fingerprint_ids.each_with_index do |id,j|
+                fingerprints[id] ||= []
+                fingerprints[id] << fingerprint.include?(id) 
+              end
             end
           end
         end
@@ -86,12 +87,14 @@ module OpenTox
         
         neighbors.each_with_index do |row,i|
           neighbor = Compound.find row["_id"]
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
-            activities << Math.log10(act)
-            weights << row["tanimoto"] # TODO cosine ?
-            neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
-              physchem[pid] ||= []
-              physchem[pid] <<  v
+          if row["features"][params[:prediction_feature_id].to_s]
+            row["features"][params[:prediction_feature_id].to_s].each do |act|
+              activities << Math.log10(act)
+              weights << row["tanimoto"] # TODO cosine ?
+              neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+                physchem[pid] ||= []
+                physchem[pid] <<  v
+              end
             end
           end
         end
diff --git a/lib/validation.rb b/lib/validation.rb
index 3659341..b72d273 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -37,11 +37,10 @@ module OpenTox
       nr_unpredicted = 0
       activities = test_set.data_entries.collect{|de| de.first}
       prediction_dataset.data_entries.each_with_index do |de,i|
-        if de[0] and de[1] 
+        if de[0] #and de[1] 
           cid = prediction_dataset.compound_ids[i]
           rows = cids.each_index.select{|r| cids[r] == cid }
           activities = rows.collect{|r| test_set.data_entries[r][0]}
-          #activity = activities[i]
           prediction = de.first
           confidence = de[1]
           predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
diff --git a/test/all.rb b/test/all.rb
index 2bb1c4f..eddf4e6 100644
--- a/test/all.rb
+++ b/test/all.rb
@@ -1,5 +1,5 @@
-exclude = ["./setup.rb","./all.rb"]
+# "./default_environment.rb" has to be executed separately
+exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
 (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
-  p test 
   require_relative test
 end
diff --git a/test/classification.rb b/test/classification.rb
new file mode 100644
index 0000000..bedbe14
--- /dev/null
+++ b/test/classification.rb
@@ -0,0 +1,41 @@
+require_relative "setup.rb"
+
+class LazarClassificationTest < MiniTest::Test
+
+  def test_lazar_classification
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    model = Model::LazarClassification.create training_dataset
+
+    [ {
+      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
+      :prediction => "false",
+      :confidence => 0.25281385281385277,
+      :nr_neighbors => 11
+    },{
+      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
+      :prediction => "false",
+      :confidence => 0.3639589577089577,
+      :nr_neighbors => 14
+    } ].each do |example|
+      prediction = model.predict example[:compound]
+      assert_equal example[:prediction], prediction[:value]
+      #assert_equal example[:confidence], prediction[:confidence]
+      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
+    end
+
+    compound = Compound.from_smiles "CCO"
+    prediction = model.predict compound
+    assert_equal ["false"], prediction[:database_activities]
+    assert_equal "true", prediction[:value]
+
+    # make a dataset prediction
+    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    prediction = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction.compounds
+
+    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
+    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
+    # cleanup
+    [training_dataset,model,compound_dataset].each{|o| o.delete}
+  end
+end
diff --git a/test/dataset.rb b/test/dataset.rb
index 2f75703..297251e 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -8,7 +8,7 @@ class DatasetTest < MiniTest::Test
     d1 = Dataset.new 
     d1.save
     datasets = Dataset.all 
-    assert_equal Dataset, datasets.first.class
+    assert datasets.first.is_a?(Dataset), "#{datasets.first} is not a Dataset."
     d1.delete
   end
 
@@ -203,16 +203,6 @@ class DatasetTest < MiniTest::Test
     assert_equal 0.00323, d2.data_entries[5][0]
   end
 
-  def test_scaled_dataset
-    original_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    scaled_dataset = original_dataset.scale
-    scaled_dataset.data_entries.each_with_index do |row,i|
-      row.each_with_index do |value,j|
-        assert_equal original_dataset.data_entries[i][j].round(4), scaled_dataset.original_value(value,j).round(4) if value # ignore nils
-      end
-    end
-  end
-
   def test_folds
     dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
     dataset.folds(10).each do |fold|
diff --git a/test/descriptor-long.rb b/test/descriptor-long.rb
deleted file mode 100644
index 7a4c00f..0000000
--- a/test/descriptor-long.rb
+++ /dev/null
@@ -1,26 +0,0 @@
-require_relative "setup.rb"
-class DescriptorLongTest < MiniTest::Test
-
-  def test_dataset_all
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    skip "CDK descriptor calculation takes too long for some compounds"
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = OpenTox::Algorithm::Descriptor.physchem dataset
-    assert_equal dataset.compounds, d.compounds
-    assert_equal 332, d.features.size
-    assert_equal 332, d.data_entries.first.size
-    d.delete
-  end
-
-  def test_dataset_openbabel
-    # TODO: improve CDK descriptor calculation speed or add timeout
-    dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
-    d = Algorithm::Descriptor.physchem dataset, Algorithm::Descriptor::OBDESCRIPTORS.keys
-    assert_equal dataset.compounds, d.compounds
-    size = Algorithm::Descriptor::OBDESCRIPTORS.keys.size
-    assert_equal size, d.features.size
-    assert_equal size, d.data_entries.first.size
-    d.delete
-  end
-
-end
diff --git a/test/fminer-long.rb b/test/fminer-long.rb
deleted file mode 100644
index 845ed71..0000000
--- a/test/fminer-long.rb
+++ /dev/null
@@ -1,41 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_multicell
-    skip
-    #skip "multicell segfaults"
-    # TODO aborts, probably fminer
-    # or OpenBabel segfault
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    p feature_dataset.training_parameters
-    assert_equal dataset.compound_ids, feature_dataset.compound_ids
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_isscan
-    skip
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    p feature_dataset.features.size
-    p feature_dataset.training_parameters
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_kazius
-    skip
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
-    # TODO reactivate default settings
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
-    assert_equal feature_dataset.compounds.size, dataset.compounds.size
-    feature_dataset = Dataset.find feature_dataset.id
-    assert feature_dataset.data_entries.size, dataset.compounds.size
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
diff --git a/test/fminer.rb b/test/fminer.rb
deleted file mode 100644
index 16e1f9e..0000000
--- a/test/fminer.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-require_relative "setup.rb"
-
-class FminerTest < MiniTest::Test
-
-  def test_fminer_bbrc
-    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    refute_nil dataset.id
-    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
-    feature_dataset = Dataset.find feature_dataset.id
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    # TODO: fminer calculates 62 instead of 54 features
-    # it is unclear which commit changed the numbers (occurs with old libraries/mongodb branch too
-    # modification of Compound to use smiles instead of inchis seems to have no effect
-    #assert_equal 54, feature_dataset.features.size
-    #assert_equal "C-C-C=C", feature_dataset.features.first.smarts
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features
-    smarts.each do |smart|
-      assert smart.p_value.round(2) >= 0.95
-    end
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    feature_dataset.data_entries.each_with_index do |fingerprint,i|
-      assert_equal match[i], fingerprint
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-  def test_fminer_last
-    skip "last features have to be activated"
-    dataset = OpenTox::Dataset.new
-    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
-    assert_equal dataset.compounds.size, feature_dataset.compounds.size
-    assert_equal 21, feature_dataset.features.size
-    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
-
-    compounds = feature_dataset.compounds
-    smarts = feature_dataset.features.collect{|f| f.smarts}
-    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
-    compounds.each_with_index do |c,i|
-      smarts.each_with_index do |s,j|
-        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
-      end
-    end
-
-    dataset.delete
-    feature_dataset.delete
-  end
-
-end
diff --git a/test/lazar-classification.rb b/test/lazar-classification.rb
deleted file mode 100644
index e8b2181..0000000
--- a/test/lazar-classification.rb
+++ /dev/null
@@ -1,42 +0,0 @@
-require_relative "setup.rb"
-
-class LazarClassificationTest < MiniTest::Test
-
-  def test_lazar_classification
-    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    model = Model::LazarClassification.create training_dataset#, feature_dataset
-    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
-
-    [ {
-      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
-      :prediction => "false",
-      :confidence => 0.25281385281385277,
-      :nr_neighbors => 11
-    },{
-      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
-      :prediction => "false",
-      :confidence => 0.3639589577089577,
-      :nr_neighbors => 14
-    } ].each do |example|
-      prediction = model.predict example[:compound]
-      assert_equal example[:prediction], prediction[:value]
-      #assert_equal example[:confidence], prediction[:confidence]
-      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
-    end
-
-    compound = Compound.from_smiles "CCO"
-    prediction = model.predict compound
-    assert_equal ["false"], prediction[:database_activities]
-    assert_equal "true", prediction[:value]
-
-    # make a dataset prediction
-    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
-
-    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
-    assert_equal "measured", prediction.data_entries[14][1]
-    # cleanup
-    [training_dataset,model,compound_dataset].each{|o| o.delete}
-  end
-end
diff --git a/test/lazar-fminer.rb b/test/lazar-fminer.rb
deleted file mode 100644
index 9e024a1..0000000
--- a/test/lazar-fminer.rb
+++ /dev/null
@@ -1,51 +0,0 @@
-require_relative "setup.rb"
-
-class LazarFminerTest < MiniTest::Test
-
-  def test_lazar_fminer
-    skip
-    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
-    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
-    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
-    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
-    #TODO check fminer features, see fminer.rb
-    #assert_equal 54, feature_dataset.features.size
-    feature_dataset.data_entries.each do |e|
-      assert_equal e.size, feature_dataset.features.size
-    end
-    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
-
-    [ {
-      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
-      :prediction => "false",
-      :confidence => 0.25281385281385277,
-      :nr_neighbors => 11
-    },{
-      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
-      :prediction => "false",
-      :confidence => 0.3639589577089577,
-      :nr_neighbors => 14
-    }, {
-      :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
-      :prediction => "false",
-      :confidence => 0.5555555555555556,
-      :nr_neighbors => 1
-    }].each do |example|
-      prediction = model.predict example[:compound]
-
-      assert_equal example[:prediction], prediction[:value]
-      #assert_equal example[:confidence], prediction[:confidence]
-      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
-    end
-
-    # make a dataset prediction
-    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
-
-    assert_equal "Cound not find similar compounds.", prediction.data_entries[7][2]
-    assert_equal "measured", prediction.data_entries[14][1]
-    # cleanup
-    [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
-  end
-end
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index 49a2472..a2e5fe2 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -10,6 +10,7 @@ class PredictionModelTest < MiniTest::Test
     assert pm.classification?
     refute pm.regression?
     pm.crossvalidations.each do |cv|
+      p cv
       assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
     end
     prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
diff --git a/test/regression.rb b/test/regression.rb
index c25ed2b..6936eb6 100644
--- a/test/regression.rb
+++ b/test/regression.rb
@@ -4,7 +4,7 @@ class LazarRegressionTest < MiniTest::Test
 
   def test_weighted_average
     training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
-    model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average"}
+    model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"}
     compound = Compound.from_smiles "CC(C)(C)CN"
     prediction = model.predict compound
     assert_equal 7.2, prediction[:value].round(1)
diff --git a/test/validation.rb b/test/validation.rb
index d8aae87..c803c92 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -2,56 +2,25 @@ require_relative "setup.rb"
 
 class ValidationTest < MiniTest::Test
 
-  def test_fminer_crossvalidation
-    skip
+  def test_default_classification_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarFminerClassification.create dataset
-    cv = ClassificationCrossValidation.create model
-    refute_empty cv.validation_ids
-    assert cv.accuracy > 0.8, "Crossvalidation accuracy lower than 0.8"
-    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) larger than unweighted accuracy(#{cv.accuracy}) "
-  end
-
-  def test_classification_crossvalidation
-    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset#, features
+    model = Model::LazarClassification.create dataset
     cv = ClassificationCrossValidation.create model
-    #p cv
     assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
-    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
-    #`inkview tmp.svg`
-    p cv.nr_unpredicted
-    p cv.accuracy
-    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy}) ."
   end
 
   def test_default_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
     model = Model::LazarRegression.create dataset
     cv = RegressionCrossValidation.create model
-    #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
-    p cv
-    #File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
-    #`inkview tmp.svg`
-    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
-    #`inkview tmp.svg`
-    
-    #puts cv.misclassifications.to_yaml
-    p cv.rmse
-    p cv.weighted_rmse 
     assert cv.rmse < 1.5, "RMSE > 1.5"
-    #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
-    p cv.mae 
-    p cv.weighted_mae 
     assert cv.mae < 1
-    #assert cv.weighted_mae < cv.mae
   end
 
   def test_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
-    #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
     params = {
-      :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
+      :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average",
       :neighbor_algorithm => "fingerprint_neighbors",
       :neighbor_algorithm_parameters => {
         :type => "MACCS",
@@ -67,17 +36,15 @@ class ValidationTest < MiniTest::Test
       refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
     end
 
-    assert cv.rmse < 1.5, "RMSE > 30"
-    assert cv.mae < 1
+    refute_nil cv.rmse
+    refute_nil cv.mae 
   end
 
   def test_pls_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
-    params = { :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", }
+    params = { :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression", }
     model = Model::LazarRegression.create dataset, params
     cv = RegressionCrossValidation.create model
-    p cv.nr_instances
-    p cv.nr_unpredicted
     assert cv.rmse < 1.5, "RMSE > 1.5"
     assert cv.mae < 1
   end
@@ -88,13 +55,13 @@ class ValidationTest < MiniTest::Test
     repeated_cv = RepeatedCrossValidation.create model
     repeated_cv.crossvalidations.each do |cv|
       assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
-      assert_operator cv.weighted_accuracy, :>, cv.accuracy
     end
   end
 
   def test_crossvalidation_parameters
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
     params = {
+        :training_dataset_id => dataset.id,
       :neighbor_algorithm_parameters => {
         :min_sim => 0.3,
         :type => "FP3"
@@ -116,13 +83,11 @@ class ValidationTest < MiniTest::Test
 
   def test_physchem_regression_crossvalidation
 
-    # UPLOAD DATA
     training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
     model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
     cv = RegressionCrossValidation.create model
-    p cv
-    p cv.id
-    p cv.statistics
+    refute_nil cv.rmse
+    refute_nil cv.mae 
   end
 
   def test_classification_loo_validation
@@ -132,22 +97,13 @@ class ValidationTest < MiniTest::Test
     assert_equal 14, loo.nr_unpredicted
     refute_empty loo.confusion_matrix
     assert loo.accuracy > 0.77
-    assert loo.weighted_accuracy > 0.85
-    assert loo.accuracy < loo.weighted_accuracy
   end
 
   def test_regression_loo_validation
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
     model = Model::LazarRegression.create dataset
     loo = RegressionLeaveOneOutValidation.create model
-    assert_equal 11, loo.nr_unpredicted
-    assert loo.weighted_mae < loo.mae
     assert loo.r_squared > 0.34
-    #assert_equal 14, loo.nr_unpredicted
-    #p loo.confusion_matrix
-    #p loo.accuracy
-    #File.open("tmp.svg","w+"){|f| f.puts loo.correlation_plot}
-    #`inkview tmp.svg`
   end
 
 end
-- 
cgit v1.2.3