From e718cf76f32fb29d6c7c3732ec82f35b0da49122 Mon Sep 17 00:00:00 2001
From: "helma@in-silico.ch" <helma@in-silico.ch>
Date: Fri, 5 Oct 2018 17:06:46 +0200
Subject: sdf import, csv files with id column

---
 lib/compound.rb        |   3 +-
 lib/dataset.rb         | 237 ++++++++++++++++++++++++++++++++++++++++++++++---
 test/data/input_53.csv |  54 +++++++++++
 test/data/input_53.tsv |  54 +++++++++++
 test/dataset.rb        |  52 +++++++++++
 test/setup.rb          |   6 +-
 6 files changed, 389 insertions(+), 17 deletions(-)
 create mode 100644 test/data/input_53.csv
 create mode 100644 test/data/input_53.tsv

diff --git a/lib/compound.rb b/lib/compound.rb
index e8f6bc4..d80f579 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -319,7 +319,8 @@ module OpenTox
       obconversion.read_string obmol, identifier
       case output_format
       when /smi|can|inchi/
-        obconversion.write_string(obmol).gsub(/\s/,'').chomp
+        #obconversion.write_string(obmol).gsub(/\s/,'').chomp
+        obconversion.write_string(obmol).split(/\s/).first
       when /sdf/
         # TODO: find disconnected structures
         # strip_salts
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 4e504de..17c30d5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -1,5 +1,6 @@
 require 'csv'
 require 'tempfile'
+require 'digest/md5'
 
 module OpenTox
 
@@ -7,6 +8,7 @@ module OpenTox
   class Dataset
 
     field :data_entries, type: Hash, default: {}
+    field :md5, type: String
 
     # Readers
 
@@ -104,6 +106,7 @@ module OpenTox
     
     # Convert dataset to csv format including compound smiles as first column, other column headers are feature names
     # @return [String]
+    # TODO original_id
     def to_csv(inchi=false)
       CSV.generate() do |csv| 
         compound = substances.first.is_a? Compound
@@ -152,28 +155,120 @@ module OpenTox
 
     # Parsers
 
-    # Create a dataset from file (csv,sdf,...)
-    # @param filename [String]
-    # @return [String] dataset uri
-    # TODO
-    #def self.from_sdf_file
-    #end
+    # Download a PubChem assay as CSV (work in progress: the table is only printed, no dataset is built yet)
+    # @param aid [Integer,String] PubChem assay ID (AID), interpolated into the PUG REST URL
+    # @return [nil] currently the result of `puts`; the dataset construction below is still commented out
+    def self.from_pubchem aid
+      csv = RestClientWrapper.get "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV"
+      table = CSV.parse csv.to_s # the response holds CSV text; CSV.read would treat it as a file path
+      puts table
+=begin
+          dataset = self.new(:source => file, :name => name, :md5 => md5)
+          dataset.parse_table table, accept_empty_values
+        else
+      puts csv
+i = 0
+activities = []
+File.readlines(ARGV[0]).each do |line|
+  if i > 2
+    tokens = line.split ","
+    p line if tokens[1].empty?
+    activities << [tokens[1],tokens[3]]
+  end
+  i += 1
+end
+
+puts "SMILES,Activity"
+activities.each_slice(100) do |slice| # get SMILES in chunks
+  sids = slice.collect{|e| e[0]}
+  smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n")
+  abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size
+  smiles.each_with_index do |smi,i|
+    act = slice[i]
+    puts [smi.chomp,act[1]].join(",")
+  end
+end
+=end
+    end
+
+    # Create a dataset from an SDF file; one compound per record, data items (`> <name>`) become features
+    # @param file [String] path of the SDF file
+    # @return [OpenTox::Dataset] the newly parsed dataset, or the existing one with the same file MD5
+    def self.from_sdf_file file
+      md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+      dataset = self.find_by(:md5 => md5)
+      if dataset
+        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
+      else
+        $logger.debug "Parsing #{file}."
+        name = File.basename(file,".*") # a bare `name` in a class method would be the class name
+        read_result = false
+        sdf = ""
+        dataset = self.new(:source => file, :name => name, :md5 => md5)
+        original_id = NominalFeature.find_or_create_by(:name => "original_id")
+
+        feature_name = ""
+        compound = nil
+        features = {}
+
+        File.readlines(file).each do |line|
+          if line.match %r{\$\$\$\$} # record terminator: flush the collected molecule and its features
+            sdf << line
+            id = sdf.split("\n").first.chomp # first line of the record is used as original id
+            compound = Compound.from_sdf sdf
+            dataset.add compound, original_id, id
+            features.each { |f,v| dataset.add compound, f, v }
+            sdf = ""
+            features = {}
+          elsif line.match(/^>\s+</)
+            feature_name = line.match(/^>\s+<(.*)>/)[1]
+            read_result = true # the next line holds the value of this data item
+          else
+            if read_result
+              value = line.chomp
+              if value.numeric?
+                feature = NumericFeature.find_or_create_by(:name => feature_name)
+                value = value.to_f
+              else
+                feature = NominalFeature.find_or_create_by(:name => feature_name)
+              end
+              features[feature] = value
+              #p compound.smiles, feature.name, value
+              read_result = false
+            else
+              sdf << line
+            end
+          end
+        end
+        dataset.save # persist the import so that the md5 lookup above can find it next time
+      end
+      dataset
+    end
     
     # Create a dataset from CSV file
     # @param [File] 
     # @param [TrueClass,FalseClass] accept or reject empty values
     # @return [OpenTox::Dataset]
     def self.from_csv_file file, accept_empty_values=false
-      source = file
-      name = File.basename(file,".*")
-      dataset = self.find_by(:source => source, :name => name)
+      md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+      dataset = self.find_by(:md5 => md5)
       if dataset
         $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
       else
         $logger.debug "Parsing #{file}."
-        table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
-        dataset = self.new(:source => source, :name => name)
-        dataset.parse_table table, accept_empty_values
+        table = nil
+        [",","\t",";"].each do |sep| # guess CSV separator from the header line, first match wins
+          if File.readlines(file).first.to_s.match(/#{sep}/) # to_s: an empty file has no first line
+            table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+            break
+          end
+        end
+        if table
+          dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) # bare `name` would be the class name here
+          dataset.parse_table table, accept_empty_values
+        else
+          bad_request_error "#{file} is not a valid CSV/TSV file. Could not find \",\" \";\" or TAB as column separator."
+        end
       end
       dataset
     end
@@ -187,10 +282,18 @@ module OpenTox
       # features
       feature_names = table.shift.collect{|f| f.strip}
       warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
-      compound_format = feature_names.shift.strip
+
+      original_id = nil 
+      if feature_names[0] =~ /ID/i # check ID column
+        feature_names.shift 
+        original_id = NominalFeature.find_or_create_by(:name => "original_id")
+      end
+
+      compound_format = feature_names.shift
       bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
       numeric = []
       features = []
+
       # guess feature types
       feature_names.each_with_index do |f,i|
         metadata = {:name => f}
@@ -213,6 +316,7 @@ module OpenTox
 
       all_substances = []
       table.each_with_index do |vals,i|
+        original_id_value = vals.shift.strip if original_id
         identifier = vals.shift.strip
         warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
         begin
@@ -239,6 +343,8 @@ module OpenTox
           next
         end
 
+        add substance, original_id, original_id_value if original_id
+
         vals.each_with_index do |v,j|
           if v.blank?
             warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
@@ -294,4 +400,109 @@ module OpenTox
 
   end
 
+  class Batch
+    # Mongoid document (collection "batch") holding compound identifiers, compound ids and optional original ids read from a CSV/TSV file
+    include OpenTox
+    include Mongoid::Document
+    include Mongoid::Timestamps
+    store_in collection: "batch"
+    field :name,  type: String
+    field :source,  type: String
+    field :identifiers, type: Array
+    field :ids, type: Array
+    field :compounds, type: Array
+    field :warnings, type: Array, default: []
+    # Import a batch from a CSV/TSV file (looked up by source and name first); returns the saved or already existing Batch
+    def self.from_csv_file file
+      source = file
+      name = File.basename(file,".*")
+      batch = self.find_by(:source => source, :name => name)
+      if batch
+        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})."
+      else
+        $logger.debug "Parsing #{file}."
+        # check delimiter
+        line = File.readlines(file).first
+        if line.match(/\t/)
+          table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+        else
+          table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+        end
+        batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => [])
+
+        # original IDs (kept in a local: a class-level @original_ids would leak into later imports)
+        if table[0][0] =~ /ID/i
+          original_ids = table.collect{|row| row.shift}
+          original_ids.shift # drop the header cell
+        end
+        
+        # features
+        feature_names = table.shift.collect{|f| f.strip}
+        batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+        compound_format = feature_names.shift.strip
+        unless compound_format =~ /SMILES|InChI/i
+          File.delete file
+          bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \
+          "Accepted formats: SMILES, InChI. Please take a look on the help page."
+        end
+        numeric = []
+        features = []
+        # guess feature types
+        feature_names.each_with_index do |f,i|
+          metadata = {:name => f}
+          values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+          types = values.collect{|v| v.numeric? ? true : false}.uniq
+          feature = nil
+          if values.size == 0 # empty feature
+          elsif  values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+            numeric[i] = true
+            feature = NumericFeature.find_or_create_by(metadata)
+          else
+            metadata["accept_values"] = values
+            numeric[i] = false
+            feature = NominalFeature.find_or_create_by(metadata)
+          end
+          features << feature if feature
+        end
+        
+        table.each_with_index do |vals,i|
+          identifier = vals.shift.strip.gsub(/^'|'$/,"")
+          begin
+            case compound_format
+            when /SMILES/i
+              compound = OpenTox::Compound.from_smiles(identifier)
+            when /InChI/i
+              compound = OpenTox::Compound.from_inchi(identifier)
+            end
+          rescue 
+            compound = nil
+          end
+          # collect only for present compounds
+          unless compound.nil?
+            batch.identifiers << identifier
+            batch.compounds << compound.id
+            batch.ids << original_ids[i] if original_ids
+          else
+            batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}."
+          end
+        end
+        batch.compounds.duplicates.each do |duplicate|
+          $logger.debug "Duplicates found in #{name}."
+          dup = Compound.find duplicate
+          positions = []
+          batch.compounds.each_with_index do |co,i|
+            c = Compound.find co
+            if !c.blank? and c.inchi and c.inchi == dup.inchi
+              positions << i+1
+            end
+          end
+          batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}."
+        end
+        batch.save
+      end
+      batch
+    end
+
+  end
+
 end
diff --git a/test/data/input_53.csv b/test/data/input_53.csv
new file mode 100644
index 0000000..b213027
--- /dev/null
+++ b/test/data/input_53.csv
@@ -0,0 +1,54 @@
+ID,SMILES
+123-30-8,Oc1ccc(N)cc1
+68391-25-3,OC(COc1ccccc1)CNc2ccc(cc2)Cc3ccc(N)cc3
+62-53-3,Nc1ccccc1
+123-98-8,O=C(CCCCCCCC(=O)Cl)Cl
+106-51-4,O=C1C=CC(=O)C=C1
+7144-65-2,O(c1ccccc1c2ccccc2)CC3OC3
+3130-19-6,O=C(OCC1CCC2OC2(C1))CCCCC(=O)OCC3CCC4OC4(C3)
+140-95-4,O=C(NCO)NCO
+2778-42-9,O=C=NC(c1cccc(c1)C(N=C=O)(C)C)(C)C
+593-60-2,C=CBr
+75-25-2,C(Br)(Br)Br
+1852-16-0,O=C(C=C)NCOCCCC
+107-58-4,O=C(C=C)NC(C)(C)C
+592-35-8,O=C(OCCCC)N
+2426-08-6,O(CCCC)CC1OC1
+79-07-2,O=C(N)CCl
+110-75-8,O(C=C)CCCl
+67-66-3,C(Cl)(Cl)Cl
+26172-55-4,O=C1C=C(Cl)SN1C
+598-09-4,O1CC1(C)CCl
+2556-36-7,O=C=NC1CCC(N=C=O)CC1
+3271-22-5,n1c(nc(nc1OC)c2ccc3ccc4cccc5ccc2c3c45)OC
+2680-03-7,O=C(C=C)N(C)C
+13036-41-4,O=C(C=C)NCOCC
+556-52-5,OCC1OC1
+2530-83-8,O(CCC[Si](OC)(OC)OC)CC1OC1
+106-90-1,O=C(OCC1OC1)C=C
+26761-45-5,O=C(OCC1OC1)C(C)(C)CCCCCC
+122-60-1,O(c1ccccc1)CC2OC2
+2210-79-9,O(c1ccccc1C)CC2OC2
+2461-15-6,O(CC1OC1)CC(CC)CCCC
+75-02-5,FC=C
+98-01-1,O=Cc1occc1
+111-30-8,O=CCCCC=O
+107-22-2,O=CC=O
+78-84-2,O=CC(C)C
+11087-88-0,O=C(OCCCCCC(C)C)CCCCCCCC1OC1(CCCCCCCC)
+3644-11-9,O=C(C=C)NCOC
+1187-59-3,O=C(C=C)NC
+54208-63-8,O(c1ccccc1Cc3ccccc3(OCC2OC2))CC4OC4
+110-26-9,O=C(C=C)NCNC(=O)C=C
+1208-52-2,Nc1ccc(cc1)Cc2ccccc2(N)
+71033-08-4,O(c1ccc(cc1)C(c3ccc(OCC(OCC2OC2)COCCCC)cc3)(C)C)CC(OCC4OC4)COCCCC
+5165-97-9,O=C(C=C)NC(C)(C)CS(=O)(=O)O
+34813-62-2,O=C=NCCCC(C)CN=C=O
+16669-59-3,O=C(C=C)NCOCC(C)C
+80-48-8,O=S(=O)(OC)c1ccc(cc1)C
+2386-87-0,O=C(OCC1CCC2OC2(C1))C3CCC4OC4(C3)
+104-49-4,O=C=Nc1ccc(N=C=O)cc1
+103-71-9,O=C=Nc1ccccc1
+111-19-3,O=C(CCCCCCCCC(=O)Cl)Cl
+7320-37-8,O1CC1CCCCCCCCCCCCCC
+2451-62-9,O=C1N(C(=O)N(C(=O)N1CC2OC2)CC3OC3)CC4OC4
diff --git a/test/data/input_53.tsv b/test/data/input_53.tsv
new file mode 100644
index 0000000..c46fdd4
--- /dev/null
+++ b/test/data/input_53.tsv
@@ -0,0 +1,54 @@
+Id	Smiles
+123-30-8	Oc1ccc(N)cc1
+68391-25-3	OC(COc1ccccc1)CNc2ccc(cc2)Cc3ccc(N)cc3
+62-53-3	Nc1ccccc1
+123-98-8	O=C(CCCCCCCC(=O)Cl)Cl
+106-51-4	O=C1C=CC(=O)C=C1
+7144-65-2	O(c1ccccc1c2ccccc2)CC3OC3
+3130-19-6	O=C(OCC1CCC2OC2(C1))CCCCC(=O)OCC3CCC4OC4(C3)
+140-95-4	O=C(NCO)NCO
+2778-42-9	O=C=NC(c1cccc(c1)C(N=C=O)(C)C)(C)C
+593-60-2	C=CBr
+75-25-2	C(Br)(Br)Br
+1852-16-0	O=C(C=C)NCOCCCC
+107-58-4	O=C(C=C)NC(C)(C)C
+592-35-8	O=C(OCCCC)N
+2426-08-6	O(CCCC)CC1OC1
+79-07-2	O=C(N)CCl
+110-75-8	O(C=C)CCCl
+67-66-3	C(Cl)(Cl)Cl
+26172-55-4	O=C1C=C(Cl)SN1C
+598-09-4	O1CC1(C)CCl
+2556-36-7	O=C=NC1CCC(N=C=O)CC1
+3271-22-5	n1c(nc(nc1OC)c2ccc3ccc4cccc5ccc2c3c45)OC
+2680-03-7	O=C(C=C)N(C)C
+13036-41-4	O=C(C=C)NCOCC
+556-52-5	OCC1OC1
+2530-83-8	O(CCC[Si](OC)(OC)OC)CC1OC1
+106-90-1	O=C(OCC1OC1)C=C
+26761-45-5	O=C(OCC1OC1)C(C)(C)CCCCCC
+122-60-1	O(c1ccccc1)CC2OC2
+2210-79-9	O(c1ccccc1C)CC2OC2
+2461-15-6	O(CC1OC1)CC(CC)CCCC
+75-02-5	FC=C
+98-01-1	O=Cc1occc1
+111-30-8	O=CCCCC=O
+107-22-2	O=CC=O
+78-84-2	O=CC(C)C
+11087-88-0	O=C(OCCCCCC(C)C)CCCCCCCC1OC1(CCCCCCCC)
+3644-11-9	O=C(C=C)NCOC
+1187-59-3	O=C(C=C)NC
+54208-63-8	O(c1ccccc1Cc3ccccc3(OCC2OC2))CC4OC4
+110-26-9	O=C(C=C)NCNC(=O)C=C
+1208-52-2	Nc1ccc(cc1)Cc2ccccc2(N)
+71033-08-4	O(c1ccc(cc1)C(c3ccc(OCC(OCC2OC2)COCCCC)cc3)(C)C)CC(OCC4OC4)COCCCC
+5165-97-9	O=C(C=C)NC(C)(C)CS(=O)(=O)O
+34813-62-2	O=C=NCCCC(C)CN=C=O
+16669-59-3	O=C(C=C)NCOCC(C)C
+80-48-8	O=S(=O)(OC)c1ccc(cc1)C
+2386-87-0	O=C(OCC1CCC2OC2(C1))C3CCC4OC4(C3)
+104-49-4	O=C=Nc1ccc(N=C=O)cc1
+103-71-9	O=C=Nc1ccccc1
+111-19-3	O=C(CCCCCCCCC(=O)Cl)Cl
+7320-37-8	O1CC1CCCCCCCCCCCCCC
+2451-62-9	O=C1N(C(=O)N(C(=O)N1CC2OC2)CC3OC3)CC4OC4
diff --git a/test/dataset.rb b/test/dataset.rb
index 055a029..11a4697 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -1,6 +1,21 @@
+# batch class
+
 require_relative "setup.rb"
 
 class DatasetTest < MiniTest::Test
+  
+  # TODO 
+  def test_from_pubchem
+    d = Dataset.from_pubchem 1190
+  end
+
+  def test_merge
+    skip "TODO"
+  end
+
+  def test_to_sdf
+    skip "TODO"
+  end
 
   # basics
 
@@ -21,6 +36,34 @@ class DatasetTest < MiniTest::Test
 
   # real datasets
 
+  def test_upload_csv_with_id
+    d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv"
+    assert_equal 53, d.compounds.size
+    assert_equal 1, d.features.size
+    f = d.features[0]
+    assert_equal "original_id", f.name
+    assert_equal ["123-30-8"], d.values(d.compounds.first,f)
+  end
+
+  def test_upload_tsv_with_id
+    d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv"
+    assert_equal Dataset, d.class
+    assert_equal 53, d.compounds.size
+    assert_equal 1, d.features.size
+    f = d.features[0]
+    assert_equal "original_id", f.name
+    assert_equal ["123-30-8"], d.values(d.compounds.first,f)
+  end
+
+  def test_upload_sdf
+    #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf"
+    d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf"
+    assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles
+    f = Feature.find_by(:name => "original_id")
+    assert_equal 35, d.features.size
+    assert_equal ["9415"], d.values(d.compounds.first,f)
+  end
+
   def test_upload_hamster
     d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
     assert_equal Dataset, d.class
@@ -103,6 +146,15 @@ class DatasetTest < MiniTest::Test
     d.delete
   end
 
+  def test_multiple_uploads
+    datasets = []
+    2.times do
+      d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv")
+      datasets << d
+    end
+    assert_equal datasets[0],datasets[1]
+  end
+
   # batch predictions
 
   def test_create_without_features_smiles_and_inchi
diff --git a/test/setup.rb b/test/setup.rb
index 4a11aa0..c4c04cb 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,8 +5,8 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 #$mongo.database.drop
 #$gridfs = $mongo.database.fs # recreate GridFS indexes
-PhysChem.descriptors
+#PhysChem.descriptors
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
-training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-Import::Enanomapper.import unless training_dataset
+#training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+#Import::Enanomapper.import unless training_dataset
-- 
cgit v1.2.3