From 19eb655f4af1a4631692989a30a59b7b78e6669b Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 20 Apr 2018 10:45:11 +0000 Subject: batch download with original identifiers --- application.rb | 59 +++++++++++++++++++++++++---------------- batch.rb | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ helper.rb | 9 +++---- views/batch.haml | 1 - 4 files changed, 120 insertions(+), 30 deletions(-) create mode 100644 batch.rb diff --git a/application.rb b/application.rb index 9c6f63e..b567147 100644 --- a/application.rb +++ b/application.rb @@ -1,5 +1,6 @@ require_relative 'task.rb' require_relative 'prediction.rb' +require_relative 'batch.rb' require_relative 'helper.rb' include OpenTox @@ -8,6 +9,7 @@ configure :production, :development do enable :reloader also_reload './helper.rb' also_reload './prediction.rb' + also_reload './batch.rb' end before do @@ -46,12 +48,11 @@ get '/task/?' do task = Task.find(params[:turi].to_s) return JSON.pretty_generate(:percent => task.percent) elsif params[:predictions] - task = Task.find(params[:predictions].to_s) - predictions = task.predictions[params[:model]] + task = Task.find(params[:predictions]) pageSize = params[:pageSize].to_i - 1 pageNumber= params[:pageNumber].to_i - 1 if params[:model] == "Cramer" - prediction = predictions + prediction = task.predictions[params[:model]] compound = Compound.find prediction["compounds"][pageNumber] image = compound.svg smiles = compound.smiles @@ -63,6 +64,7 @@ get '/task/?' 
do string += "" html += "#{string}" else + predictions = task.predictions[params[:model]].collect{|hash| hash.values[0]} prediction_object = Prediction.find predictions[pageNumber] prediction = prediction_object.prediction compound = Compound.find prediction_object.compound @@ -147,7 +149,7 @@ get '/download/dataset/:id' do end get '/delete/dataset/:id' do - dataset = Dataset.find params[:id] + dataset = Batch.find params[:id] dataset.delete File.delete File.join("tmp/"+dataset.name+".csv") redirect to("/") @@ -164,11 +166,13 @@ get '/predict/csv/:task/:model/:filename/?' do else header = task.csv lines = [] - task.predictions[params[:model]].each_with_index do |p,idx| - if p.is_a? BSON::ObjectId - lines << "#{idx+1},#{Prediction.find(p).csv}" + task.predictions[params[:model]].each_with_index do |hash,idx| + identifier = hash.keys[0] + prediction_id = hash.values[0] + if prediction_id.is_a? BSON::ObjectId + lines << "#{idx+1},#{identifier},#{Prediction.find(prediction_id).csv}" else - lines << "#{idx+1},#{p}\n" + lines << "#{idx+1},#{identifier},#{prediction_id}\n" end end csv = header + lines.join("") @@ -182,8 +186,9 @@ post '/predict/?' do # process batch prediction if !params[:fileselect].blank? || !params[:existing].blank? if !params[:existing].blank? - @dataset = Dataset.find params[:existing].keys[0] + @dataset = Batch.find params[:existing].keys[0] @compounds = @dataset.compounds + @identifiers = @dataset.identifiers @filename = @dataset.name end if !params[:fileselect].blank? @@ -192,19 +197,21 @@ post '/predict/?' do end @filename = params[:fileselect][:filename] begin - @dataset = Dataset.find_by(:name => params[:fileselect][:filename].sub(/\.csv$/,"")) + @dataset = Batch.find_by(:name => params[:fileselect][:filename].sub(/\.csv$/,"")) if @dataset $logger.debug "Take file from database."
@compounds = @dataset.compounds + @identifiers = @dataset.identifiers else File.open('tmp/' + params[:fileselect][:filename], "w") do |f| f.write(params[:fileselect][:tempfile].read) end - input = Dataset.from_csv_file File.join("tmp", params[:fileselect][:filename]), true + input = Batch.from_csv_file File.join("tmp", params[:fileselect][:filename]) $logger.debug "Processing '#{params[:fileselect][:filename]}'" - if input.class == OpenTox::Dataset + if input.class == OpenTox::Batch @dataset = input - @compounds = input.compounds + @compounds = @dataset.compounds + @identifiers = @dataset.identifiers else File.delete File.join("tmp", params[:fileselect][:filename]) bad_request_error "Could not serialize file '#{@filename}'." @@ -216,7 +223,7 @@ post '/predict/?' do end if @compounds.size == 0 - message = dataset[:warnings] + message = @dataset[:warnings] @dataset.delete bad_request_error message end @@ -237,7 +244,7 @@ post '/predict/?' do if type == "Regression" unit = (type == "Regression") ? "(#{m.unit})" : "" converted_unit = (type == "Regression") ? "#{m.unit =~ /\b(mmol\/L)\b/ ? "(mg/L)" : "(mg/kg_bw/day)"}" : "" - header = "ID,Endpoint,Unique SMILES,inTrainingSet,Measurements #{unit},Prediction #{unit},Prediction #{converted_unit},"\ + header = "ID,Input,Endpoint,Unique SMILES,inTrainingSet,Measurements #{unit},Prediction #{unit},Prediction #{converted_unit},"\ "Prediction Interval Low #{unit},Prediction Interval High #{unit},"\ "Prediction Interval Low #{converted_unit},Prediction Interval High #{converted_unit},"\ "inApplicabilityDomain,Note\n" @@ -245,7 +252,7 @@ post '/predict/?' 
do # add header for classification if type == "Classification" av = m.prediction_feature.accept_values - header = "ID,Endpoint,Unique SMILES,inTrainingSet,Measurements,Consensus Prediction,Consensus Confidence,"\ + header = "ID,Input,Endpoint,Unique SMILES,inTrainingSet,Measurements,Consensus Prediction,Consensus Confidence,"\ "Structural alerts for mutagenicity,Lazar Prediction,"\ "Lazar predProbability #{av[0]},Lazar predProbability #{av[1]},inApplicabilityDomain,Note\n" end @@ -253,7 +260,8 @@ post '/predict/?' do p = 100.0/@compounds.size counter = 1 predictions = [] - @compounds.each_with_index do |compound,idx| + @compounds.each_with_index do |cid,idx| + compound = Compound.find cid if Prediction.where(compound: compound.id, model: m.id).exists? prediction_object = Prediction.find_by(compound: compound.id, model: m.id) prediction = prediction_object.prediction @@ -263,6 +271,8 @@ post '/predict/?' do prediction_object[:csv] = prediction_to_csv(m,compound,prediction) prediction_object.save end + # identifier + identifier = @identifiers[idx] else prediction = m.predict(compound) # save prediction object @@ -312,9 +322,12 @@ post '/predict/?' do prediction_object[:prediction] = prediction prediction_object[:csv] = prediction_to_csv(m,compound,prediction) prediction_object.save + + # identifier + identifier = @identifiers[idx] end - # collect prediction_object ids - predictions << prediction_id + # collect prediction_object ids with identifier + predictions << {identifier => prediction_id} t.update_percent((counter*p).ceil > 100 ? 100 : (counter*p).ceil) counter += 1 end @@ -323,24 +336,24 @@ post '/predict/?' 
do # write predictions @predictions["#{model}"] = predictions else # Cramer model - compounds = @compounds.collect{|c| c.smiles} + compounds = @compounds.collect{|cid| c = Compound.find cid; c.smiles} prediction = [Toxtree.predict(compounds, "Cramer rules"), Toxtree.predict(compounds, "Cramer rules with extensions")] output = {} output["model_name"] = "Oral toxicity (Cramer rules)" output["cramer_rules"] = prediction.collect{|array| array.collect{|hash| hash["Cramer rules"]}}.flatten.compact output["cramer_rules_extensions"] = prediction.collect{|array| array.collect{|hash| hash["Cramer rules, with extensions"]}}.flatten.compact # header - csv = "ID,Endpoint,Unique SMILES,Cramer rules,Cramer rules with extensions\n" + csv = "ID,Input,Endpoint,Unique SMILES,Cramer rules,Cramer rules with extensions\n" # content compounds.each_with_index do |smiles, idx| - csv << "#{idx+1},#{output["model_name"]},#{smiles},"\ + csv << "#{idx+1},#{@identifiers[idx]},#{output["model_name"]},#{smiles},"\ "#{output["cramer_rules"][idx] != "nil" ? output["cramer_rules"][idx] : "none" },"\ "#{output["cramer_rules_extensions"][idx] != "nil" ? output["cramer_rules_extensions"][idx] : "none"}\n" end predictions = {} predictions["Cramer rules"] = output["cramer_rules"].collect{|rule| rule != "nil" ? rule : "none"} predictions["Cramer rules, with extensions"] = output["cramer_rules_extensions"].collect{|rule| rule != "nil" ? 
rule : "none"} - predictions["compounds"] = @compounds.collect{|c| c.id} + predictions["compounds"] = @compounds if @dataset.warnings @dataset.warnings.each do |warning| diff --git a/batch.rb b/batch.rb new file mode 100644 index 0000000..2dd9359 --- /dev/null +++ b/batch.rb @@ -0,0 +1,81 @@ +require 'csv' +require 'tempfile' + +module OpenTox + + class Batch + + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "batch" + field :name, type: String + field :source, type: String + field :identifiers, type: Array + field :compounds, type: Array + field :warnings, type: Array, default: [] + + def self.from_csv_file file + source = file + name = File.basename(file,".*") + batch = self.find_by(:source => source, :name => name) + if batch + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + else + $logger.debug "Parsing #{file}." + table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + batch = self.new(:source => source, :name => name, :identifiers => [], :compounds => []) + + # features + feature_names = table.shift.collect{|f| f.strip} + batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + numeric = [] + features = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ?
true : false}.uniq + feature = nil + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) + else + metadata["accept_values"] = values + numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) + end + features << feature if feature + end + + table.each_with_index do |vals,i| + identifier = vals.shift.strip + batch.identifiers << identifier + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + if compound.nil? # compound parsers may return nil + #warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + batch.compounds << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." + next + end + batch.compounds << compound.id + end + batch.save + end + batch + end + + end + +end diff --git a/helper.rb b/helper.rb index dc4b695..ca20def 100644 --- a/helper.rb +++ b/helper.rb @@ -16,17 +16,14 @@ helpers do end def prediction_to_csv(m,c,p) - #model = Model::Validation.find(m.to_s) model = m model_name = "#{model.endpoint.gsub('_', ' ')} (#{model.species})" model_unit = model.regression? ? "(#{model.unit})" : "" converted_model_unit = model.regression? ? "#{model.unit =~ /\b(mmol\/L)\b/ ? 
"(mg/L)" : "(mg/kg_bw/day)"}" : "" - #predictions = predictions_ids.collect{|prediction_id| Prediction.find prediction_id} csv = "" - compound = c#Compound.find prediction_object.compound - prediction = p#prediction_object.prediction - #prediction.delete_if{|k,v| k =~ /neighbors|prediction_feature_id/} + compound = c + prediction = p output = {} line = "" output["model_name"] = model_name @@ -123,7 +120,7 @@ helpers do end def dataset_storage - all = Dataset.where(:source => /^tmp/) + all = Batch.where(:source => /^tmp/) out = Hash.new all.reverse.each{|d| out[d.id] = [d.name, d.created_at]} out diff --git a/views/batch.haml b/views/batch.haml index c71d056..1aa6779 100644 --- a/views/batch.haml +++ b/views/batch.haml @@ -102,7 +102,6 @@ - @models.each_with_index do |model,idx| - m = Model::Validation.find model unless model == "Cramer" - task = @tasks[idx].id - - predictions = @tasks[idx].predictions["#{model}"] #result.panel{:id=>idx} %div.row %div.col-md-6 -- cgit v1.2.3