From 13e7865f386603fb784e62feef2ee2a56c015b45 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 14 Aug 2018 12:02:03 +0000
Subject: classification min_sim=0.5, 5 repeated cvs

---
 lib/model.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index dce53a9..0ed70f2 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -80,7 +80,7 @@ module OpenTox
             }
             model.algorithms[:similarity] = {
               :method => "Algorithm::Similarity.tanimoto",
-              :min => 0.1,
+              :min => 0.5,
             }
           elsif model.class == LazarRegression
             model.algorithms[:prediction] = {
-- 
cgit v1.2.3


From bdc6b5b40437896384561d74a510560e9e592364 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 9 Oct 2018 18:20:27 +0200
Subject: tentative random forest classification: hangs unpredictably during
 caret model generation/optimization for some (inorganic?) compounds.

---
 lib/model.rb | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 0ed70f2..8901a2c 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -37,7 +37,7 @@ module OpenTox
       # @return [OpenTox::Model::Lazar]
       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
         bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
-        prediction_feature = training_dataset.features.first unless prediction_feature
+        prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature
         # TODO: prediction_feature without training_dataset: use all available data
 
         # guess model type
@@ -199,6 +199,8 @@ module OpenTox
       # @return [Hash]
       def predict_substance substance, threshold = self.algorithms[:similarity][:min]
         
+        p substance.smiles
+        t = Time.now
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
         when /tanimoto/ # binary features
@@ -284,6 +286,9 @@ module OpenTox
         else # try again with a lower threshold
           predict_substance substance, 0.2
         end
+        p prediction
+        p Time.now - t
+        prediction
       end
 
       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
-- 
cgit v1.2.3


From de763211bd2b6451e3a8dc20eb95a3ecf72bef17 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Thu, 11 Oct 2018 12:13:40 +0200
Subject: initial dataset batch prediction

---
 lib/model.rb | 50 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 17 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 8901a2c..7ee50fe 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -36,12 +36,12 @@ module OpenTox
       #
       # @return [OpenTox::Model::Lazar]
       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
-        bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
-        prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature
+        bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
+        prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
         # TODO: prediction_feature without training_dataset: use all available data
 
         # guess model type
-        prediction_feature.numeric? ?  model = LazarRegression.new : model = LazarClassification.new
+        prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
 
         model.prediction_feature_id = prediction_feature.id
         model.training_dataset_id = training_dataset.id
@@ -199,7 +199,7 @@ module OpenTox
       # @return [Hash]
       def predict_substance substance, threshold = self.algorithms[:similarity][:min]
         
-        p substance.smiles
+        #p substance.smiles
         t = Time.now
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
@@ -286,8 +286,8 @@ module OpenTox
         else # try again with a lower threshold
           predict_substance substance, 0.2
         end
-        p prediction
-        p Time.now - t
+        #p prediction
+        #p Time.now - t
         prediction
       end
 
@@ -314,6 +314,11 @@ module OpenTox
         predictions = {}
         substances.each do |c|
           predictions[c.id.to_s] = predict_substance c
+          if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
+            prediction_feature.accept_values.each do |v|
+              predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
+            end
+          end
           predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id 
         end
 
@@ -325,17 +330,28 @@ module OpenTox
         elsif object.is_a? Array
           return predictions
         elsif object.is_a? Dataset
-          # prepare prediction dataset
-          measurement_feature = Feature.find prediction_feature_id
-
-          prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
-          prediction_dataset = LazarPrediction.create(
-            :name => "Lazar prediction for #{prediction_feature.name}",
-            :creator =>  __FILE__,
-            :prediction_feature_id => prediction_feature.id,
-            :predictions => predictions
-          )
-          return prediction_dataset
+          if prediction_feature.is_a? NominalBioActivity
+            f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
+            probability_features = {}
+            prediction_feature.accept_values.each do |v|
+              probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+            end
+          elsif prediction_feature.is_a? NumericBioActivity
+            f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
+            # TODO prediction interval
+          end
+
+          d = Dataset.new(:name => object.name)
+          # add predictions to dataset
+          predictions.each do |substance_id,p|
+            d.warnings += p[:warnings]
+            unless p[:value].nil?
+              d.add substance_id,f,p[:value]
+              p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p}
+            end
+          end
+          d.save
+          return d
         end
 
       end
-- 
cgit v1.2.3


From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Wed, 24 Oct 2018 18:21:34 +0200
Subject: dataset map

---
 lib/model.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 7ee50fe..9858949 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -199,7 +199,6 @@ module OpenTox
       # @return [Hash]
       def predict_substance substance, threshold = self.algorithms[:similarity][:min]
         
-        #p substance.smiles
         t = Time.now
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
@@ -286,7 +285,6 @@ module OpenTox
         else # try again with a lower threshold
           predict_substance substance, 0.2
         end
-        #p prediction
         #p Time.now - t
         prediction
       end
@@ -330,11 +328,12 @@ module OpenTox
         elsif object.is_a? Array
           return predictions
         elsif object.is_a? Dataset
+          warning_feature = InfoFeature.find_or_create_by(:name => "Warnings")
           if prediction_feature.is_a? NominalBioActivity
             f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
             probability_features = {}
             prediction_feature.accept_values.each do |v|
-              probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+              probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
             end
           elsif prediction_feature.is_a? NumericBioActivity
             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
@@ -344,10 +343,11 @@ module OpenTox
           d = Dataset.new(:name => object.name)
           # add predictions to dataset
           predictions.each do |substance_id,p|
-            d.warnings += p[:warnings]
+            d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings]
             unless p[:value].nil?
               d.add substance_id,f,p[:value]
               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p}
+            # TODO prediction interval
             end
           end
           d.save
-- 
cgit v1.2.3


From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Mon, 29 Oct 2018 17:58:09 +0100
Subject: dataset folds fixed

---
 lib/model.rb | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 9858949..fc98e09 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -277,7 +277,7 @@ module OpenTox
           prediction.merge! result
           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
           #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
-            #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
+            #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
           #end
         end
         if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
@@ -328,7 +328,8 @@ module OpenTox
         elsif object.is_a? Array
           return predictions
         elsif object.is_a? Dataset
-          warning_feature = InfoFeature.find_or_create_by(:name => "Warnings")
+          d = object.copy
+          warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
           if prediction_feature.is_a? NominalBioActivity
             f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
             probability_features = {}
@@ -337,17 +338,19 @@ module OpenTox
             end
           elsif prediction_feature.is_a? NumericBioActivity
             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            # TODO prediction interval
+            prediction_interval = {}
+            ["lower","upper"].each do |v|
+              prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+            end
           end
 
-          d = Dataset.new(:name => object.name)
           # add predictions to dataset
           predictions.each do |substance_id,p|
             d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings]
             unless p[:value].nil?
               d.add substance_id,f,p[:value]
-              p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p}
-            # TODO prediction interval
+              p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
+              p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
             end
           end
           d.save
-- 
cgit v1.2.3


From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Mon, 29 Oct 2018 20:34:39 +0100
Subject: dataset predictions fixed

---
 lib/model.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index fc98e09..7eaa469 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -346,7 +346,8 @@ module OpenTox
 
           # add predictions to dataset
           predictions.each do |substance_id,p|
-            d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings]
+            substance_id = BSON::ObjectId.from_string(substance_id)
+            d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty?
             unless p[:value].nil?
               d.add substance_id,f,p[:value]
               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
-- 
cgit v1.2.3


From d61f78093f4ddf03c27a2c8ae0bab9c1f10c80f5 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 30 Oct 2018 17:26:59 +0100
Subject: tests fixed

---
 lib/model.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 7eaa469..6d5cf7b 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -46,6 +46,7 @@ module OpenTox
         model.prediction_feature_id = prediction_feature.id
         model.training_dataset_id = training_dataset.id
         model.name = "#{prediction_feature.name} (#{training_dataset.name})" 
+        
         # git or gem versioning
         dir = File.dirname(__FILE__)
         path = File.expand_path("../", File.expand_path(dir))
@@ -485,6 +486,8 @@ module OpenTox
         model.is_a? LazarClassification
       end
 
+      # TODO from_pubchem_aid
+
       # Create and validate a lazar model from a csv file with training data and a json file with metadata
       # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
       # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
@@ -533,6 +536,14 @@ module OpenTox
 
     end
 
+    # TODO
+    def to_json
+      "{\n  metadata:#{super},\n  model:#{model.to_json},  repeated_crossvalidations:#{repeated_crossvalidations.to_json}\n}"
+    end
+
+    def from_json_file
+    end
+
   end
 
 end
-- 
cgit v1.2.3


From 2d4ce39cb1b489e26b0d6d96026054566a4f77b9 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 30 Oct 2018 21:11:04 +0100
Subject: dataset merge

---
 lib/model.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 6d5cf7b..f50fcd7 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -487,6 +487,16 @@ module OpenTox
       end
 
       # TODO from_pubchem_aid
+      def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: 
+        model_validation = Model::Validation.create species: species, endpoint: endpoint
+        p "create model"
+        model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature
+        model_validation[:model_id] = model.id
+        p "create_crossvalidations"
+        model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
+        model_validation.save
+        model_validation
+      end
 
       # Create and validate a lazar model from a csv file with training data and a json file with metadata
       # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
-- 
cgit v1.2.3


From 3a9c9332b660d35720ad4fa1f55ee0883e53aecd Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Fri, 2 Nov 2018 20:34:44 +0100
Subject: warnings fixed, cleanup

---
 lib/model.rb | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index f50fcd7..a0c60f0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -38,7 +38,6 @@ module OpenTox
       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
         bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
         prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
-        # TODO: prediction_feature without training_dataset: use all available data
 
         # guess model type
         prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
@@ -198,9 +197,8 @@ module OpenTox
       # Predict a substance (compound or nanoparticle)
       # @param [OpenTox::Substance]
       # @return [Hash]
-      def predict_substance substance, threshold = self.algorithms[:similarity][:min]
+      def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil
         
-        t = Time.now
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
         when /tanimoto/ # binary features
@@ -229,7 +227,7 @@ module OpenTox
           bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
         end
         
-        prediction = {:warnings => [], :measurements => []}
+        prediction ||= {:warnings => [], :measurements => []}
         prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
         neighbor_ids = []
         neighbor_similarities = []
@@ -240,7 +238,7 @@ module OpenTox
         substance_ids.each_with_index do |s,i|
           # handle query substance
           if substance.id.to_s == s
-            prediction[:measurements] << dependent_variables[i]
+            prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass
             prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
           else
             if fingerprints?
@@ -277,17 +275,13 @@ module OpenTox
           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
           prediction.merge! result
           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
-          #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
-            #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
-          #end
         end
         if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
           prediction
         else # try again with a lower threshold
-          predict_substance substance, 0.2
+          prediction[:warnings] << "Lowering similarity threshold to 0.2."
+          predict_substance substance, 0.2, prediction
         end
-        #p Time.now - t
-        prediction
       end
 
       # Predict a substance (compound or nanoparticle), an array of substances or a dataset
-- 
cgit v1.2.3


From cf80ed17102a0368df8d65037d113b521cdf6f0c Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 6 Nov 2018 19:01:58 +0100
Subject: sdf export fixed

---
 lib/model.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index a0c60f0..966460b 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -481,13 +481,13 @@ module OpenTox
       end
 
       # TODO from_pubchem_aid
-      def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: 
+      def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: , folds: 10, repeats: 5
         model_validation = Model::Validation.create species: species, endpoint: endpoint
-        p "create model"
+        #p "create model"
         model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature
         model_validation[:model_id] = model.id
-        p "create_crossvalidations"
-        model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required
+        #p "create_crossvalidations"
+        model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model,folds,repeats).id # full class name required
         model_validation.save
         model_validation
       end
-- 
cgit v1.2.3


From 6e23be652ad90c747aaccf15258bdaa4458185a4 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Tue, 13 Nov 2018 14:32:09 +0100
Subject: public dataset download

---
 lib/model.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 966460b..70ae43c 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -422,6 +422,7 @@ module OpenTox
       field :species, type: String
       field :source, type: String
       field :unit, type: String
+      field :warnings, type: Array
       field :model_id, type: BSON::ObjectId
       field :repeated_crossvalidation_id, type: BSON::ObjectId
 
@@ -494,7 +495,7 @@ module OpenTox
 
       # Create and validate a lazar model from a csv file with training data and a json file with metadata
       # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
-      # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
+      # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
       def self.from_csv_file file
         metadata_file = file.sub(/csv$/,"json")
         bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
-- 
cgit v1.2.3


From ae78e8216909ebfa708b8da3c55248a68abc291c Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Wed, 14 Nov 2018 13:35:17 +0100
Subject: public model validation, updated documentation

---
 lib/model.rb | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 70ae43c..db69120 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -44,7 +44,7 @@ module OpenTox
 
         model.prediction_feature_id = prediction_feature.id
         model.training_dataset_id = training_dataset.id
-        model.name = "#{prediction_feature.name} (#{training_dataset.name})" 
+        model.name = training_dataset.name
         
         # git or gem versioning
         dir = File.dirname(__FILE__)
@@ -481,20 +481,8 @@ module OpenTox
         model.is_a? LazarClassification
       end
 
-      # TODO from_pubchem_aid
-      def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: , folds: 10, repeats: 5
-        model_validation = Model::Validation.create species: species, endpoint: endpoint
-        #p "create model"
-        model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature
-        model_validation[:model_id] = model.id
-        #p "create_crossvalidations"
-        model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model,folds,repeats).id # full class name required
-        model_validation.save
-        model_validation
-      end
-
       # Create and validate a lazar model from a csv file with training data and a json file with metadata
-      # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
+      # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
       def self.from_csv_file file
         metadata_file = file.sub(/csv$/,"json")
@@ -510,6 +498,7 @@ module OpenTox
 
       # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
       #   nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+      #   *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track these changes permanently!*
       # @param [OpenTox::Dataset, nil] training_dataset
       # @param [OpenTox::Feature, nil] prediction_feature
       # @param [Hash, nil] algorithms
@@ -541,14 +530,6 @@ module OpenTox
 
     end
 
-    # TODO
-    def to_json
-      "{\n  metadata:#{super},\n  model:#{model.to_json},  repeated_crossvalidations:#{repeated_crossvalidations.to_json}\n}"
-    end
-
-    def from_json_file
-    end
-
   end
 
 end
-- 
cgit v1.2.3


From 7e547fd4a296f497615a7805d565b378cb1bd7cd Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Wed, 14 Nov 2018 17:33:44 +0100
Subject: bad_request_error substituted with ArgumentError

---
 lib/model.rb | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index db69120..caf8a6e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -36,7 +36,7 @@ module OpenTox
       #
       # @return [OpenTox::Model::Lazar]
       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
-        bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
+        raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
         prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
 
         # guess model type
@@ -62,7 +62,7 @@ module OpenTox
 
         # set defaults#
         substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
-        bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
+        raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
 
         if substance_classes.first == "OpenTox::Compound"
 
@@ -110,7 +110,7 @@ module OpenTox
             },
           }
         else
-          bad_request_error "Cannot create models for #{substance_classes.first}."
+          raise ArgumentError, "Cannot create models for #{substance_classes.first}."
         end
         
         # overwrite defaults with explicit parameters
@@ -175,7 +175,7 @@ module OpenTox
           model.descriptor_ids = feature_ids & property_ids
           model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
         else
-          bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
+          raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
         end
         
         if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
@@ -224,7 +224,7 @@ module OpenTox
             end
           end
         else
-          bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
+          raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
         end
         
         prediction ||= {:warnings => [], :measurements => []}
@@ -300,7 +300,7 @@ module OpenTox
         elsif object.is_a? Dataset
           substances = object.substances
         else 
-          bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
+          raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
         end
 
         # make predictions
@@ -486,7 +486,7 @@ module OpenTox
       # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
       def self.from_csv_file file
         metadata_file = file.sub(/csv$/,"json")
-        bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
+        raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file
         model_validation = self.new JSON.parse(File.read(metadata_file))
         training_dataset = Dataset.from_csv_file file
         model = Lazar.create training_dataset: training_dataset
@@ -510,7 +510,7 @@ module OpenTox
         unless training_dataset # try to import 
           Import::Enanomapper.import
           training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-          bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
+          raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
         end
         prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
 
-- 
cgit v1.2.3


From 0882c2cd0de934d7377fc9d08c306be98612c88a Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Fri, 16 Nov 2018 18:42:42 +0100
Subject: real datasets for testing, test data cleanup, Daphnia import, upper
 and lower similarity thresholds

---
 lib/model.rb | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index caf8a6e..08ca07e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -32,7 +32,7 @@ module OpenTox
       # @param [OpenTox::Feature, nil] prediction_feature
       #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
       # @param [Hash, nil] algorithms
-      #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. 
+      #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. 
       #
       # @return [OpenTox::Model::Lazar]
       def self.create prediction_feature:nil, training_dataset:, algorithms:{}
@@ -80,7 +80,7 @@ module OpenTox
             }
             model.algorithms[:similarity] = {
               :method => "Algorithm::Similarity.tanimoto",
-              :min => 0.5,
+              :min => [0.5,0.2],
             }
           elsif model.class == LazarRegression
             model.algorithms[:prediction] = {
@@ -88,7 +88,7 @@ module OpenTox
             }
             model.algorithms[:similarity] = {
               :method => "Algorithm::Similarity.tanimoto",
-              :min => 0.5,
+              :min => [0.5,0.2],
             }
           end
 
@@ -100,7 +100,7 @@ module OpenTox
             },
             :similarity => {
               :method => "Algorithm::Similarity.weighted_cosine",
-              :min => 0.5,
+              :min => [0.5,0.2],
             },
             :prediction => {
               :method => "Algorithm::Caret.rf",
@@ -197,7 +197,7 @@ module OpenTox
       # Predict a substance (compound or nanoparticle)
       # @param [OpenTox::Substance]
       # @return [Hash]
-      def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil
+      def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
         
         @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
@@ -228,7 +228,7 @@ module OpenTox
         end
         
         prediction ||= {:warnings => [], :measurements => []}
-        prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
+        prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
         neighbor_ids = []
         neighbor_similarities = []
         neighbor_dependent_variables = []
@@ -238,7 +238,7 @@ module OpenTox
         substance_ids.each_with_index do |s,i|
           # handle query substance
           if substance.id.to_s == s
-            prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass
+            prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
             prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
           else
             if fingerprints?
@@ -264,11 +264,19 @@ module OpenTox
         
         if neighbor_similarities.empty?
           prediction[:value] = nil
-          prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
+          prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
+          if threshold == algorithms[:similarity][:min].last
+            prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
+            return prediction
+          end
         elsif neighbor_similarities.size == 1
           prediction[:value] = nil
-          prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
+          prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
           prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
+          if threshold == algorithms[:similarity][:min].last
+            prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
+            return prediction
+          end
         else
           query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
           # call prediction algorithm
@@ -276,11 +284,17 @@ module OpenTox
           prediction.merge! result
           prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
         end
-        if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
-          prediction
-        else # try again with a lower threshold
-          prediction[:warnings] << "Lowering similarity threshold to 0.2."
-          predict_substance substance, 0.2, prediction
+        if threshold == algorithms[:similarity][:min].first
+          if prediction[:warnings].empty? 
+            prediction[:confidence] = "High (close to bioassay results)"
+            return prediction
+          else # try again with a lower threshold
+            prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
+            predict_substance substance, algorithms[:similarity][:min].last, prediction
+          end
+        elsif threshold < algorithms[:similarity][:min].first
+          prediction[:confidence] = "Low (lower than bioassay results)"
+          return prediction
         end
       end
 
-- 
cgit v1.2.3


From 1b44e0cd76f2ead93b8b3fa0f970c85ef32a4b14 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Fri, 16 Nov 2018 22:45:17 +0100
Subject: confidence for prediction datasets

---
 lib/model.rb | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 08ca07e..cbfefe3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -338,7 +338,8 @@ module OpenTox
           return predictions
         elsif object.is_a? Dataset
           d = object.copy
-          warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
+          #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
+          confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
           if prediction_feature.is_a? NominalBioActivity
             f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
             probability_features = {}
@@ -356,7 +357,7 @@ module OpenTox
           # add predictions to dataset
           predictions.each do |substance_id,p|
             substance_id = BSON::ObjectId.from_string(substance_id)
-            d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty?
+            d.add substance_id,confidence_feature,p[:confidence]
             unless p[:value].nil?
               d.add substance_id,f,p[:value]
               p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
-- 
cgit v1.2.3


From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 20 Jun 2019 22:01:50 +0200
Subject: separate csv serialisations for batch predictions and training data,
 repeated measurements in mutagenicity dataset fixed, daphnia import fixed,
 CENTRAL_MONGO_IP removed

---
 lib/model.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index cbfefe3..05cd113 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -286,14 +286,14 @@ module OpenTox
         end
         if threshold == algorithms[:similarity][:min].first
           if prediction[:warnings].empty? 
-            prediction[:confidence] = "High (close to bioassay results)"
+            prediction[:confidence] = "Similar to bioassay results"
             return prediction
           else # try again with a lower threshold
             prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
             predict_substance substance, algorithms[:similarity][:min].last, prediction
           end
         elsif threshold < algorithms[:similarity][:min].first
-          prediction[:confidence] = "Low (lower than bioassay results)"
+          prediction[:confidence] = "Lower than bioassay results"
           return prediction
         end
       end
@@ -348,9 +348,9 @@ module OpenTox
             end
           elsif prediction_feature.is_a? NumericBioActivity
             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            prediction_interval = {}
+            prediction_interval = []
             ["lower","upper"].each do |v|
-              prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+              prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
             end
           end
 
-- 
cgit v1.2.3


From bea6f89f2a54a0612a30ce90c56c4ddecc4f7ed1 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 21 Jun 2019 12:55:21 +0200
Subject: measurement added for single neighbors

---
 lib/model.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index 05cd113..d7b2df6 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -272,7 +272,7 @@ module OpenTox
         elsif neighbor_similarities.size == 1
           prediction[:value] = nil
           prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
-          prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
+          prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
           if threshold == algorithms[:similarity][:min].last
             prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
             return prediction
-- 
cgit v1.2.3