From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 20 Jun 2019 22:01:50 +0200
Subject: separate csv serialisations for batch predictions and training data,
 repeated measurements in mutagenicity dataset fixed, daphnia import fixed,
 CENTRAL_MONGO_IP removed

---
 lib/dataset.rb               | 75 ++++++++++++++++++++++++++++++--------------
 lib/download.rb              |  3 +-
 lib/feature.rb               | 11 ++++---
 lib/lazar.rb                 |  9 ++++--
 lib/model.rb                 |  8 ++---
 lib/validation-statistics.rb |  4 +--
 6 files changed, 71 insertions(+), 39 deletions(-)

(limited to 'lib')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index df17569..596c53c 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -96,8 +96,14 @@ module OpenTox
 
     # Get nominal and numeric prediction features
     # @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>]
-    def prediction_features
-      features.select{|f| f._type.match("Prediction")}
+    def prediction_feature # NOTE: returns a single feature (first whose _type ends in "Prediction") or nil, not an Array
+      features.select{|f| f._type.match(/Prediction$/)}.first
+    end
+
+    # Get supporting nominal and numeric prediction features (class probabilities, prediction interval)
+    # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>]
+    def prediction_supporting_features
+      features.select{|feature| feature.is_a?(LazarPredictionProbability) || feature.is_a?(LazarPredictionInterval)}
     end
 
     # Get nominal and numeric merged features
@@ -259,7 +265,7 @@ module OpenTox
       feature_names = table.shift.collect{|f| f.strip}
       raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
 
-      if feature_names[0] =~ /ID/i # check ID column
+      if feature_names[0] !~ /SMILES|InChI/i # check ID column
         original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
       else
         original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
@@ -343,30 +349,52 @@ module OpenTox
 
     # Serialisation
     
-    # Convert dataset to csv format 
+    # Convert lazar prediction dataset to csv format 
     # @return [String]
-    def to_csv #inchi=false
-      CSV.generate() do |csv| 
-        
-        compound = substances.first.is_a? Compound
-        f = features - original_id_features - original_smiles_features - warnings_features
-        header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name}
-        header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound
-        compound ? header << "Canonical SMILES" : header << "Name"
-        header += f.collect{|f| f.name}
-        header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} 
-        csv << header
-
-        substances.each do |substance|
-          row = original_id_features.collect{|f| values(substance,f).join(" ")}
-          row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
-          compound ? row << substance.smiles : row << substance.name
-          row += f.collect{|f| values(substance,f).join(" ")}
-          row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} 
+    def to_prediction_csv
+      # Optional columns (Prediction, Confidence) use the same condition in header and
+      # rows, otherwise the row cells would be shifted against the header.
+      compound = substances.first.is_a? Compound
+      header = ["ID"]
+      header << "Original SMILES" if compound
+      header << (compound ? "Canonical SMILES" : "Name")
+      header << "Prediction" if prediction_feature
+      header << "Confidence" if confidence_feature
+      header += prediction_supporting_features.collect{|f| f.name}
+      header << "Measurements"
+      csv = [header]
+      substances.each do |substance|
+        row = original_id_features.collect{|f| values(substance,f).join(" ")}
+        row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
+        row << (compound ? substance.smiles : substance.name)
+        row << values(substance,prediction_feature).join(" ") if prediction_feature
+        row << values(substance,confidence_feature).join(" ") if confidence_feature
+        row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
+        row << values(substance,bioactivity_features[0]).join(" ")
+        csv << row
+      end
+      csv.collect{|r| r.join(",")}.join("\n")
+    end
+    
+    # Convert dataset into csv formatted training data (first bioactivity feature only)
+    # One row per measurement, i.e. substances with repeated measurements get several rows
+    # @return [String]
+    # @raise [ArgumentError] if the dataset has no bioactivity feature
+    def to_training_csv
+      feature = bioactivity_features.first
+      raise ArgumentError, "No bioactivity feature in dataset." unless feature
+      header = ["Canonical SMILES"]
+      header << feature.name
+      csv = [header]
+
+      substances.each do |substance|
+        values(substance,feature).each do |activity| # new row for each value
+          row = [substance.smiles]
+          row << activity
           csv << row
         end
-
       end
+      csv.collect{|r| r.join(",")}.join("\n")
     end
 
     # Convert dataset to SDF format
@@ -396,7 +424,6 @@ module OpenTox
       predictions = {}
       substances.each do |s| 
         predictions[s] ||= {}
-        prediction_feature = prediction_features.first
         predictions[s][:value] = values(s,prediction_feature).first
         #predictions[s][:warnings] = []
         #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
diff --git a/lib/download.rb b/lib/download.rb
index f17d060..2546dc4 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -122,7 +122,6 @@ module OpenTox
     # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
     def self.mutagenicity
       $logger.debug "Mutagenicity"
-      # TODO add download/conversion programs to lazar dependencies
       hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
       kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
       efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
@@ -185,7 +184,7 @@ module OpenTox
       map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
       dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
       dataset.merged_features.first.name = "Mutagenicity"
-      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv}
+      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
       meta = {
         :species => "Salmonella typhimurium",
         :endpoint => "Mutagenicity",
diff --git a/lib/feature.rb b/lib/feature.rb
index 72c26d7..296a174 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -18,6 +18,9 @@ module OpenTox
   # Confidence
   class Confidence < Feature
     field :dataset_id, type: BSON::ObjectId
+    def name
+      "Confidence"
+    end
   end
 
   # Categorical variables
@@ -66,13 +69,13 @@ module OpenTox
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
     def name
-      "#{self[:name]} Prediction"
+      "Prediction: #{self[:name]}"
     end
   end
 
   class LazarPredictionProbability < NominalLazarPrediction
     def name
-      "probability(#{self[:name]})"
+      "Probability: #{self[:name]}"
     end
   end
 
@@ -81,13 +84,13 @@ module OpenTox
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
     def name
-      "#{self[:name]} Prediction"
+      "Prediction: #{self[:name]}"
     end
   end
 
   class LazarPredictionInterval < NumericLazarPrediction
     def name
-      "prediction_interval_#{self[:name]}"
+      "#{self[:name].capitalize} prediction interval"
     end
   end
 
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2a3f749..e77de9d 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
 
 ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] 
 ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
+# CH: central mongo lookup disabled, it interferes with /etc/hosts on my machine
 # search for a central mongo database in use
 # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
-CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
+# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
 Mongoid.load_configuration({
   :clients => {
     :default => {
       :database => ENV["LAZAR_ENV"],
-      :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+      #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+      :hosts => ["localhost:27017"]
     }
   }
 })
 Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
 $gridfs = $mongo.database.fs
 
 # Logger setup
diff --git a/lib/model.rb b/lib/model.rb
index cbfefe3..05cd113 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -286,14 +286,14 @@ module OpenTox
         end
         if threshold == algorithms[:similarity][:min].first
           if prediction[:warnings].empty? 
-            prediction[:confidence] = "High (close to bioassay results)"
+            prediction[:confidence] = "Similar to bioassay results"
             return prediction
           else # try again with a lower threshold
             prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
             predict_substance substance, algorithms[:similarity][:min].last, prediction
           end
         elsif threshold < algorithms[:similarity][:min].first
-          prediction[:confidence] = "Low (lower than bioassay results)"
+          prediction[:confidence] = "Lower than bioassay results"
           return prediction
         end
       end
@@ -348,9 +348,9 @@ module OpenTox
             end
           elsif prediction_feature.is_a? NumericBioActivity
             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            prediction_interval = {}
+            prediction_interval = []
             ["lower","upper"].each do |v|
-              prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+              prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
             end
           end
 
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 8a8970e..d603294 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -18,7 +18,7 @@ module OpenTox
                 if pred[:value] == v
                   confusion_matrix[:all][i][i] += 1
                   self.nr_predictions[:all] += 1
-                  if pred[:confidence].match(/High/i)
+                  if pred[:confidence].match(/Similar/i)
                     confusion_matrix[:confidence_high][i][i] += 1
                     self.nr_predictions[:confidence_high] += 1
                   elsif pred[:confidence].match(/Low/i)
@@ -32,7 +32,7 @@ module OpenTox
                 if pred[:value] == v
                   confusion_matrix[:all][i][(i+1)%2] += 1
                   self.nr_predictions[:all] += 1
-                  if pred[:confidence].match(/High/i)
+                  if pred[:confidence].match(/Similar/i)
                     confusion_matrix[:confidence_high][i][(i+1)%2] += 1
                     self.nr_predictions[:confidence_high] += 1
                   elsif pred[:confidence].match(/Low/i)
-- 
cgit v1.2.3