From 6c4fd5809d20596ad2cfe507cd762bdcdce7fc57 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 3 Feb 2017 14:36:07 +0100
Subject: algorithm selection tutorial

---
 README.md | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 97 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 28ed18f..2bb5c80 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,74 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
 
 #### Experiment with other algorithms
 
-  You can pass algorithms parameters to the `Model::Validation.create_from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions.
+  You can pass algorithm specifications as parameters to the `Model::Validation.create_from_csv_file` and `Model::Lazar.create` commands. Algorithms for descriptors, similarity calculations, feature selection and local models are specified in the `algorithms` parameter. Unspecified algorithms and parameters are replaced by default values. The example below selects
+
+  - MP2D fingerprint descriptors
+  - Tanimoto similarity with a threshold of 0.1
+  - no feature selction
+  - weighted majority vote predictions
+
+  ```
+    algorithms = {
+      :descriptors => { # descriptor algorithm
+        :method => "fingerprint", # fingerprint descriptors
+        :type => "MP2D" # fingerprint type, e.g. FP4, MACCS
+      },
+      :similarity => { # similarity algorithm
+        :method => "Algorithm::Similarity.tanimoto",
+        :min => 0.1 # similarity threshold for neighbors
+      },
+      :feature_selection => nil, # no feature selection
+      :prediction => { # local modelling algorithm
+        :method => "Algorithm::Classification.weighted_majority_vote",
+      },
+    }
+
+    training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
+    model = Model::Lazar.create  training_dataset: training_dataset, algorithms: algorithms
+  ```
+
+  The next example creates a regression model with
+
+  - calculated descriptors from OpenBabel libraries
+  - weighted cosine similarity and a threshold of 0.5
+  - descriptors that are correlated with the endpoint
+  - local partial least squares models from the R caret package
+
+  ```
+    algorithms = {
+      :descriptors => { # descriptor algorithm
+        :method => "calculate_properties",
+        :features => PhysChem.openbabel_descriptors,
+      },
+      :similarity => { # similarity algorithm
+        :method => "Algorithm::Similarity.weighted_cosine",
+        :min => 0.5
+      },
+      :feature_selection => { # feature selection algorithm
+        :method => "Algorithm::FeatureSelection.correlation_filter",
+      },
+      :prediction => { # local modelling algorithm
+        :method => "Algorithm::Caret.pls",
+      },
+    }
+    training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
+    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+    ```
+    Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https:://github.com/opentox/lazar) for up to date information about implemented algorithms:
+
+    - Descriptor algorithms
+      - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
+      - [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
+    - [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
+    - [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
+    - Local models
+      - [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
+      - [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
+      - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
+
+
+    You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
 
 ### Create and use `lazar` nanoparticle models
 
@@ -87,7 +154,35 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
 
 #### Experiment with other datasets, endpoints and algorithms
 
-  You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. Detailed documentation and validation results can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
+  You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. Procedure and options are the same as for compounds. The following commands create and validate a `nano-lazar` model with
+
+  - measured P-CHEM properties as descriptors
+  - descriptors selected with correlation filter
+  - weighted cosine similarity with a threshold of 0.5
+  - Caret random forests
+
+```
+          algorithms = {
+            :descriptors => {
+              :method => "properties",
+              :categories => ["P-CHEM"],
+            },
+            :similarity => {
+              :method => "Algorithm::Similarity.weighted_cosine",
+              :min => 0.5
+            },
+            :feature_selection => {
+              :method => "Algorithm::FeatureSelection.correlation_filter",
+            },
+            :prediction => {
+              :method => "Algorithm::Caret.rf",
+            },
+          }
+          validation_model = Model::Validation.from_enanomapper algorithms: algorithms
+```
+
+
+  Detailed documentation and validation results for nanoparticle models can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
 
 Documentation
 -------------
-- 
cgit v1.2.3


From 2ab9b97d82ea2ec1bd8b6e6d0400c1661f219839 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 3 Feb 2017 14:40:21 +0100
Subject: typos fixed

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 2bb5c80..5a2ea06 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
 
   - MP2D fingerprint descriptors
   - Tanimoto similarity with a threshold of 0.1
-  - no feature selction
+  - no feature selection
   - weighted majority vote predictions
 
   ```
@@ -113,17 +113,18 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
     training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
     model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
     ```
+
     Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https:://github.com/opentox/lazar) for up to date information about implemented algorithms:
 
-    - Descriptor algorithms
-      - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
-      - [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
-    - [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
-    - [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
-    - Local models
-      - [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
-      - [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
-      - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
+- Descriptor algorithms
+  - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
+  - [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
+- [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
+- [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
+- Local models
+  - [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
+  - [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
+  - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
 
 
     You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
-- 
cgit v1.2.3


From 9c456a580515055b15a7091ceeaf67308bade881 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 3 Feb 2017 14:43:29 +0100
Subject: block indents

---
 README.md | 108 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 54 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index 5a2ea06..1f62c36 100644
--- a/README.md
+++ b/README.md
@@ -67,23 +67,23 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
   - weighted majority vote predictions
 
   ```
-    algorithms = {
-      :descriptors => { # descriptor algorithm
-        :method => "fingerprint", # fingerprint descriptors
-        :type => "MP2D" # fingerprint type, e.g. FP4, MACCS
-      },
-      :similarity => { # similarity algorithm
-        :method => "Algorithm::Similarity.tanimoto",
-        :min => 0.1 # similarity threshold for neighbors
-      },
-      :feature_selection => nil, # no feature selection
-      :prediction => { # local modelling algorithm
-        :method => "Algorithm::Classification.weighted_majority_vote",
-      },
-    }
-
-    training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
-    model = Model::Lazar.create  training_dataset: training_dataset, algorithms: algorithms
+algorithms = {
+  :descriptors => { # descriptor algorithm
+    :method => "fingerprint", # fingerprint descriptors
+    :type => "MP2D" # fingerprint type, e.g. FP4, MACCS
+  },
+  :similarity => { # similarity algorithm
+    :method => "Algorithm::Similarity.tanimoto",
+    :min => 0.1 # similarity threshold for neighbors
+  },
+  :feature_selection => nil, # no feature selection
+  :prediction => { # local modelling algorithm
+    :method => "Algorithm::Classification.weighted_majority_vote",
+  },
+}
+
+training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
+model = Model::Lazar.create  training_dataset: training_dataset, algorithms: algorithms
   ```
 
   The next example creates a regression model with
@@ -94,27 +94,27 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
   - local partial least squares models from the R caret package
 
   ```
-    algorithms = {
-      :descriptors => { # descriptor algorithm
-        :method => "calculate_properties",
-        :features => PhysChem.openbabel_descriptors,
-      },
-      :similarity => { # similarity algorithm
-        :method => "Algorithm::Similarity.weighted_cosine",
-        :min => 0.5
-      },
-      :feature_selection => { # feature selection algorithm
-        :method => "Algorithm::FeatureSelection.correlation_filter",
-      },
-      :prediction => { # local modelling algorithm
-        :method => "Algorithm::Caret.pls",
-      },
-    }
-    training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
-    model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+algorithms = {
+  :descriptors => { # descriptor algorithm
+    :method => "calculate_properties",
+    :features => PhysChem.openbabel_descriptors,
+  },
+  :similarity => { # similarity algorithm
+    :method => "Algorithm::Similarity.weighted_cosine",
+    :min => 0.5
+  },
+  :feature_selection => { # feature selection algorithm
+    :method => "Algorithm::FeatureSelection.correlation_filter",
+  },
+  :prediction => { # local modelling algorithm
+    :method => "Algorithm::Caret.pls",
+  },
+}
+training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
+model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
     ```
 
-    Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https:://github.com/opentox/lazar) for up to date information about implemented algorithms:
+Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https://github.com/opentox/lazar) for up-to-date information about implemented algorithms:
 
 - Descriptor algorithms
   - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
@@ -127,7 +127,7 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
   - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
 
 
-    You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
+You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
 
 ### Create and use `lazar` nanoparticle models
 
@@ -163,23 +163,23 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
   - Caret random forests
 
 ```
-          algorithms = {
-            :descriptors => {
-              :method => "properties",
-              :categories => ["P-CHEM"],
-            },
-            :similarity => {
-              :method => "Algorithm::Similarity.weighted_cosine",
-              :min => 0.5
-            },
-            :feature_selection => {
-              :method => "Algorithm::FeatureSelection.correlation_filter",
-            },
-            :prediction => {
-              :method => "Algorithm::Caret.rf",
-            },
-          }
-          validation_model = Model::Validation.from_enanomapper algorithms: algorithms
+algorithms = {
+  :descriptors => {
+    :method => "properties",
+    :categories => ["P-CHEM"],
+  },
+  :similarity => {
+    :method => "Algorithm::Similarity.weighted_cosine",
+    :min => 0.5
+  },
+  :feature_selection => {
+    :method => "Algorithm::FeatureSelection.correlation_filter",
+  },
+  :prediction => {
+    :method => "Algorithm::Caret.rf",
+  },
+}
+validation_model = Model::Validation.from_enanomapper algorithms: algorithms
 ```
 
 
-- 
cgit v1.2.3


From 75408162397b9db75b042fc128e9a01a2832828c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Sat, 18 Feb 2017 09:49:48 +0100
Subject: fixed header for nano-lazar correlation plots

---
 lib/validation-statistics.rb | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 553e6ac..236a66c 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -179,8 +179,12 @@ module OpenTox
           R.assign "prediction", y
           R.eval "all = c(measurement,prediction)"
           R.eval "range = c(min(all), max(all))"
-          title = feature.name
-          title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          if feature.name.match /Net cell association/ # ad hoc fix for awkward units
+            title = "log2(Net cell association [mL/ug(Mg)])"
+          else
+            title = feature.name
+            title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          end
           R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
           R.eval "image = image + geom_abline(intercept=0, slope=1)"
           R.eval "ggsave(file='#{tmpfile}', plot=image)"
-- 
cgit v1.2.3


From c26dc4713bba5de9e38e54f870f01071c2c4c960 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 20 Feb 2017 09:41:05 +0100
Subject: worst predictions fixed for regression

---
 lib/validation-statistics.rb | 54 ++++++++++----------------------------------
 1 file changed, 12 insertions(+), 42 deletions(-)

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 236a66c..2d522ae 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -195,51 +195,21 @@ module OpenTox
         $gridfs.find_one(_id: correlation_plot_id).data
       end
 
-      # Get predictions with the largest difference between predicted and measured values
-      # @params [Fixnum] number of predictions
-      # @params [TrueClass,FalseClass,nil] include neighbors
-      # @params [TrueClass,FalseClass,nil] show common descriptors
+      # Get predictions with measurements outside of the prediction interval
       # @return [Hash]
-      def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
-        worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
-        worst_predictions.collect do |p|
-          substance = Substance.find(p.first)
-          prediction = p[1]
-          if show_neigbors
-            neighbors = prediction["neighbors"].collect do |n|
-              common_descriptors = []
-              if show_common_descriptors
-                common_descriptors = n["common_descriptors"].collect do |d|
-                  f=Feature.find(d)
-                  {
-                    :id => f.id.to_s,
-                    :name => "#{f.name} (#{f.conditions})",
-                    :p_value => d[:p_value],
-                    :r_squared => d[:r_squared],
-                  }
-                end
-              else
-                common_descriptors = n["common_descriptors"].size
-              end
-              {
-                :name => Substance.find(n["_id"]).name,
-                :id => n["_id"].to_s,
-                :common_descriptors => common_descriptors
-              }
-            end
-          else
-            neighbors = prediction["neighbors"].size
+      def worst_predictions
+        worst_predictions = predictions.select do |sid,p|
+          p["prediction_interval"] and p["value"] and (p["measurements"].max < p["prediction_interval"][0] or p["measurements"].min > p["prediction_interval"][1])
+        end.compact.to_h
+        worst_predictions.each do |sid,p|
+          p["error"] = (p["value"] - p["measurements"].median).abs
+          if p["measurements"].max < p["prediction_interval"][0]
+            p["distance_prediction_interval"] = (p["measurements"].max - p["prediction_interval"][0]).abs
+          elsif p["measurements"].min > p["prediction_interval"][1]
+            p["distance_prediction_interval"] = (p["measurements"].min - p["prediction_interval"][1]).abs
           end
-          {
-            :id => substance.id.to_s,
-            :name => substance.name,
-            :feature => Feature.find(prediction["prediction_feature_id"]).name,
-            :error => (prediction["value"] - prediction["measurements"].median).abs,
-            :prediction => prediction["value"],
-            :measurements => prediction["measurements"],
-            :neighbors => neighbors
-          }
         end
+        worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
       end
     end
   end
-- 
cgit v1.2.3