10 files changed, 54 insertions, 29 deletions
diff --git a/lib/caret.rb b/lib/caret.rb
index f5c2bde..8bccf74 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -22,12 +22,11 @@ module OpenTox
         end
         if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] 
           prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+          prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
         elsif
           dependent_variables.size < 3
           prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
-          prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
-
+          prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
         else
           dependent_variables.each_with_index do |v,i| 
             dependent_variables[i] = to_r(v)
@@ -52,7 +51,7 @@ module OpenTox
             $logger.debug dependent_variables
             $logger.debug independent_variables
             prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
-            prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
+            prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
             return prediction
           end
           begin
@@ -73,12 +72,12 @@ module OpenTox
             $logger.debug "R caret prediction error for:"
             $logger.debug self.inspect
             prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
-            prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
+            prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
             return prediction
           end
           if prediction.nil? or prediction[:value].nil?
             prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
-            prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+            prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
           end
         end
         prediction
diff --git a/lib/classification.rb b/lib/classification.rb
index 638492b..a875903 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -18,6 +18,11 @@ module OpenTox
         class_weights.each do |a,w|
           probabilities[a] = w.sum/weights.sum
         end
+        # DG: hack to ensure there are always two probability values
+        if probabilities.keys.uniq.size == 1
+          missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
+          probabilities[missing_key] = 0.0
+        end
         probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
         p_max = probabilities.collect{|a,p| p}.max
         prediction = probabilities.key(p_max)
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 75c5db5..06a1e2a 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -90,6 +90,7 @@ module OpenTox
       field :within_prediction_interval, type: Integer, default:0
       field :out_of_prediction_interval, type: Integer, default:0
       field :correlation_plot_id, type: BSON::ObjectId
+      field :warnings, type: Array
     end
 
     # Independent repeated crossvalidations
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 44690e1..6e7d67f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -46,7 +46,7 @@ module OpenTox
       if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
         data_entries[substance.to_s][feature.to_s]
       else
-        nil
+        [nil]
       end
     end
 
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 8d22018..c33c92b 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -58,6 +58,7 @@ module OpenTox
       field :within_prediction_interval, type: Integer, default:0
       field :out_of_prediction_interval, type: Integer, default:0
       field :correlation_plot_id, type: BSON::ObjectId
+      field :warnings, type: Array
     end
 
   end
diff --git a/lib/model.rb b/lib/model.rb
index b18610d..475a346 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -57,7 +57,7 @@ module OpenTox
           model.version = {:warning => "git is not installed"}
         end
 
-        # set defaults
+        # set defaults
         substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
         bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
 
@@ -68,10 +68,6 @@ module OpenTox
               :method => "fingerprint",
               :type => "MP2D",
             },
-            :similarity => {
-              :method => "Algorithm::Similarity.tanimoto",
-              :min => 0.1
-            },
             :feature_selection => nil
           }
 
@@ -79,9 +75,17 @@ module OpenTox
             model.algorithms[:prediction] = {
                 :method => "Algorithm::Classification.weighted_majority_vote",
             }
+            model.algorithms[:similarity] = {
+              :method => "Algorithm::Similarity.tanimoto",
+              :min => 0.1,
+            }
           elsif model.class == LazarRegression
             model.algorithms[:prediction] = {
-              :method => "Algorithm::Caret.pls",
+              :method => "Algorithm::Caret.rf",
+            }
+            model.algorithms[:similarity] = {
+              :method => "Algorithm::Similarity.tanimoto",
+              :min => 0.5,
             }
           end
 
@@ -93,7 +97,7 @@ module OpenTox
             },
             :similarity => {
               :method => "Algorithm::Similarity.weighted_cosine",
-              :min => 0.5
+              :min => 0.5,
             },
             :prediction => {
               :method => "Algorithm::Caret.rf",
@@ -141,7 +145,6 @@ module OpenTox
           end
           model.descriptor_ids = model.fingerprints.flatten.uniq
           model.descriptor_ids.each do |d|
-            # resulting model may break BSON size limit (e.g. f Kazius dataset)
             model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
           end
         # calculate physchem properties
@@ -191,7 +194,7 @@ module OpenTox
       # Predict a substance (compound or nanoparticle)
       # @param [OpenTox::Substance]
       # @return [Hash]
-      def predict_substance substance
+      def predict_substance substance, threshold = self.algorithms[:similarity][:min]
         
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
@@ -221,20 +224,19 @@ module OpenTox
           bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
         end
         
-        prediction = {}
+        prediction = {:warnings => [], :measurements => []}
+        prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
         neighbor_ids = []
         neighbor_similarities = []
         neighbor_dependent_variables = []
         neighbor_independent_variables = []
 
-        prediction = {}
         # find neighbors
         substance_ids.each_with_index do |s,i|
           # handle query substance
           if substance.id.to_s == s
-            prediction[:measurements] ||= []
             prediction[:measurements] << dependent_variables[i]
-            prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
+            prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
           else
             if fingerprints?
               neighbor_descriptors = fingerprints[i]
@@ -243,7 +245,7 @@ module OpenTox
               neighbor_descriptors = scaled_variables.collect{|v| v[i]}
             end
             sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
-            if sim >= algorithms[:similarity][:min]
+            if sim >= threshold
               neighbor_ids << s
               neighbor_similarities << sim
               neighbor_dependent_variables << dependent_variables[i]
@@ -258,17 +260,27 @@ module OpenTox
         measurements = nil
         
         if neighbor_similarities.empty?
-          prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+          prediction[:value] = nil
+          prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
         elsif neighbor_similarities.size == 1
-          prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
+          prediction[:value] = nil
+          prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
+          prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
         else
           query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
           # call prediction algorithm
           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
           prediction.merge! result
           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
+          #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
+            #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
+          #end
+        end
+        if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
+          prediction
+        else # try again with a lower threshold
+          predict_substance substance, 0.2
         end
-        prediction
       end
 
       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
@@ -300,7 +312,7 @@ module OpenTox
         # serialize result
         if object.is_a? Substance
           prediction = predictions[substances.first.id.to_s]
-          prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
+          prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
           return prediction
         elsif object.is_a? Array
           return predictions
diff --git a/lib/regression.rb b/lib/regression.rb
index fd2855f..25c0732 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -17,7 +17,7 @@ module OpenTox
           sim_sum += weights[i]
         end if dependent_variables
         sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
-        {:value => prediction}
+        {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
       end
 
     end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index 034ae3a..9a5532d 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -27,6 +27,8 @@ module OpenTox
           end
         end
         predictions.select!{|cid,p| p[:value] and p[:measurements]}
+        # hack to avoid mongos file size limit error on large datasets
+        #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
         validation = self.new(
           :model_id => validation_model.id,
           :test_dataset_id => test_set.id,
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
index 8341a67..fc10cd4 100644
--- a/lib/unique_descriptors.rb
+++ b/lib/unique_descriptors.rb
@@ -48,7 +48,8 @@ UNIQUEDESCRIPTORS = [
   #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
   #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
   "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
-  "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
+  # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
+  #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
   "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
   "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
   "Cdk.LargestChain", #Returns the number of atoms in the largest chain
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 2d522ae..69e7992 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -111,6 +111,7 @@ module OpenTox
       # Get statistics
       # @return [Hash]
       def statistics
+        self.warnings = []
         self.rmse = 0
         self.mae = 0
         self.within_prediction_interval = 0
@@ -132,8 +133,10 @@ module OpenTox
               end
             end
           else
-            warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-            $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+            trd_id = model.training_dataset_id
+            smiles = Compound.find(cid).smiles
+            self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}."
+            $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
           end
         end
         R.assign "measurement", x
@@ -146,6 +149,7 @@ module OpenTox
         $logger.debug "RMSE #{rmse}"
         $logger.debug "MAE #{mae}"
         $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
+        $logger.debug "#{warnings}"
         save
         {
           :mae => mae,