From e718cf76f32fb29d6c7c3732ec82f35b0da49122 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 17:06:46 +0200 Subject: sdf import, csv files with id column --- lib/compound.rb | 3 +- lib/dataset.rb | 237 ++++++++++++++++++++++++++++++++++++++++++++++--- test/data/input_53.csv | 54 +++++++++++ test/data/input_53.tsv | 54 +++++++++++ test/dataset.rb | 52 +++++++++++ test/setup.rb | 6 +- 6 files changed, 389 insertions(+), 17 deletions(-) create mode 100644 test/data/input_53.csv create mode 100644 test/data/input_53.tsv diff --git a/lib/compound.rb b/lib/compound.rb index e8f6bc4..d80f579 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -319,7 +319,8 @@ module OpenTox obconversion.read_string obmol, identifier case output_format when /smi|can|inchi/ - obconversion.write_string(obmol).gsub(/\s/,'').chomp + #obconversion.write_string(obmol).gsub(/\s/,'').chomp + obconversion.write_string(obmol).split(/\s/).first when /sdf/ # TODO: find disconnected structures # strip_salts diff --git a/lib/dataset.rb b/lib/dataset.rb index 4e504de..17c30d5 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,5 +1,6 @@ require 'csv' require 'tempfile' +require 'digest/md5' module OpenTox @@ -7,6 +8,7 @@ module OpenTox class Dataset field :data_entries, type: Hash, default: {} + field :md5, type: String # Readers @@ -104,6 +106,7 @@ module OpenTox # Convert dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] + # TODO original_id def to_csv(inchi=false) CSV.generate() do |csv| compound = substances.first.is_a? Compound @@ -152,28 +155,120 @@ module OpenTox # Parsers - # Create a dataset from file (csv,sdf,...) - # @param filename [String] - # @return [String] dataset uri - # TODO - #def self.from_sdf_file - #end + # Create a dataset from PubChem Assay + # @param [File] + # @return [OpenTox::Dataset] + def self.from_pubchem aid + csv = RestClientWrapper.get "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" + table = CSV.read csv + puts table +=begin + dataset = self.new(:source => file, :name => name, :md5 => md5) + dataset.parse_table table, accept_empty_values + else + puts csv +i = 0 +activities = [] +File.readlines(ARGV[0]).each do |line| + if i > 2 + tokens = line.split "," + p line if tokens[1].empty? + activities << [tokens[1],tokens[3]] + end + i += 1 +end + +puts "SMILES,Activity" +activities.each_slice(100) do |slice| # get SMILES in chunks + sids = slice.collect{|e| e[0]} + smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n") + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + act = slice[i] + puts [smi.chomp,act[1]].join(",") + end +end +=end + end + + # Create a dataset from SDF file + # @param [File] + # @return [OpenTox::Dataset] + def self.from_sdf_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + else + $logger.debug "Parsing #{file}." + table = nil + read_result = false + sdf = "" + dataset = self.new(:source => file, :name => name, :md5 => md5) + original_id = NominalFeature.find_or_create_by(:name => "original_id") + + feature_name = "" + compound = nil + features = {} + + File.readlines(file).each do |line| + if line.match %r{\$\$\$\$} + sdf << line + id = sdf.split("\n").first.chomp + compound = Compound.from_sdf sdf + dataset.add compound, original_id, id + features.each { |f,v| dataset.add compound, f, v } + sdf = "" + features = {} + elsif line.match /^>\s+\s+<(.*)>/)[1] + read_result = true + else + if read_result + value = line.chomp + if value.numeric? + feature = NumericFeature.find_or_create_by(:name => feature_name) + value = value.to_f + else + feature = NominalFeature.find_or_create_by(:name => feature_name) + end + features[feature] = value + #p compound.smiles, feature.name, value + read_result = false + else + sdf << line + end + end + end + end + dataset + + end # Create a dataset from CSV file # @param [File] # @param [TrueClass,FalseClass] accept or reject empty values # @return [OpenTox::Dataset] def self.from_csv_file file, accept_empty_values=false - source = file - name = File.basename(file,".*") - dataset = self.find_by(:source => source, :name => name) + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) if dataset $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." else $logger.debug "Parsing #{file}." - table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - dataset = self.new(:source => source, :name => name) - dataset.parse_table table, accept_empty_values + table = nil + [",","\t",";"].each do |sep| # guess CSV separator + if File.readlines(file).first.match(/#{sep}/) + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + break + end + end + if table + dataset = self.new(:source => file, :name => name, :md5 => md5) + dataset.parse_table table, accept_empty_values + else + bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + end end dataset end @@ -187,10 +282,18 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip + + original_id = nil + if feature_names[0] =~ /ID/i # check ID column + feature_names.shift + original_id = NominalFeature.find_or_create_by(:name => "original_id") + end + + compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] features = [] + # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} @@ -213,6 +316,7 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| + original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin @@ -239,6 +343,8 @@ module OpenTox next end + add substance, original_id, original_id_value if original_id + vals.each_with_index do |v,j| if v.blank? warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." @@ -294,4 +400,109 @@ module OpenTox end + class Batch + + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "batch" + field :name, type: String + field :source, type: String + field :identifiers, type: Array + field :ids, type: Array + field :compounds, type: Array + field :warnings, type: Array, default: [] + + def self.from_csv_file file + source = file + name = File.basename(file,".*") + batch = self.find_by(:source => source, :name => name) + if batch + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + else + $logger.debug "Parsing #{file}." + # check delimiter + line = File.readlines(file).first + if line.match(/\t/) + table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8' + else + table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + end + batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => []) + + # original IDs + if table[0][0] =~ /ID/i + @original_ids = table.collect{|row| row.shift} + @original_ids.shift + end + + # features + feature_names = table.shift.collect{|f| f.strip} + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + unless compound_format =~ /SMILES|InChI/i + File.delete file + bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ + "Accepted formats: SMILES, InChI. Please take a look on the help page." + end + numeric = [] + features = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) + else + metadata["accept_values"] = values + numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) + end + features << feature if feature + end + + table.each_with_index do |vals,i| + identifier = vals.shift.strip.gsub(/^'|'$/,"") + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + # collect only for present compounds + unless compound.nil? + batch.identifiers << identifier + batch.compounds << compound.id + batch.ids << @original_ids[i] if @original_ids + else + batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." + end + end + batch.compounds.duplicates.each do |duplicate| + $logger.debug "Duplicates found in #{name}." + dup = Compound.find duplicate + positions = [] + batch.compounds.each_with_index do |co,i| + c = Compound.find co + if !c.blank? and c.inchi and c.inchi == dup.inchi + positions << i+1 + end + end + batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}." + end + batch.save + end + batch + end + + end + end diff --git a/test/data/input_53.csv b/test/data/input_53.csv new file mode 100644 index 0000000..b213027 --- /dev/null +++ b/test/data/input_53.csv @@ -0,0 +1,54 @@ +ID,SMILES +123-30-8,Oc1ccc(N)cc1 +68391-25-3,OC(COc1ccccc1)CNc2ccc(cc2)Cc3ccc(N)cc3 +62-53-3,Nc1ccccc1 +123-98-8,O=C(CCCCCCCC(=O)Cl)Cl +106-51-4,O=C1C=CC(=O)C=C1 +7144-65-2,O(c1ccccc1c2ccccc2)CC3OC3 +3130-19-6,O=C(OCC1CCC2OC2(C1))CCCCC(=O)OCC3CCC4OC4(C3) +140-95-4,O=C(NCO)NCO +2778-42-9,O=C=NC(c1cccc(c1)C(N=C=O)(C)C)(C)C +593-60-2,C=CBr +75-25-2,C(Br)(Br)Br +1852-16-0,O=C(C=C)NCOCCCC +107-58-4,O=C(C=C)NC(C)(C)C +592-35-8,O=C(OCCCC)N +2426-08-6,O(CCCC)CC1OC1 +79-07-2,O=C(N)CCl +110-75-8,O(C=C)CCCl +67-66-3,C(Cl)(Cl)Cl +26172-55-4,O=C1C=C(Cl)SN1C +598-09-4,O1CC1(C)CCl +2556-36-7,O=C=NC1CCC(N=C=O)CC1 +3271-22-5,n1c(nc(nc1OC)c2ccc3ccc4cccc5ccc2c3c45)OC +2680-03-7,O=C(C=C)N(C)C +13036-41-4,O=C(C=C)NCOCC +556-52-5,OCC1OC1 +2530-83-8,O(CCC[Si](OC)(OC)OC)CC1OC1 +106-90-1,O=C(OCC1OC1)C=C +26761-45-5,O=C(OCC1OC1)C(C)(C)CCCCCC +122-60-1,O(c1ccccc1)CC2OC2 +2210-79-9,O(c1ccccc1C)CC2OC2 +2461-15-6,O(CC1OC1)CC(CC)CCCC +75-02-5,FC=C +98-01-1,O=Cc1occc1 +111-30-8,O=CCCCC=O +107-22-2,O=CC=O +78-84-2,O=CC(C)C +11087-88-0,O=C(OCCCCCC(C)C)CCCCCCCC1OC1(CCCCCCCC) +3644-11-9,O=C(C=C)NCOC +1187-59-3,O=C(C=C)NC +54208-63-8,O(c1ccccc1Cc3ccccc3(OCC2OC2))CC4OC4 +110-26-9,O=C(C=C)NCNC(=O)C=C +1208-52-2,Nc1ccc(cc1)Cc2ccccc2(N) +71033-08-4,O(c1ccc(cc1)C(c3ccc(OCC(OCC2OC2)COCCCC)cc3)(C)C)CC(OCC4OC4)COCCCC +5165-97-9,O=C(C=C)NC(C)(C)CS(=O)(=O)O +34813-62-2,O=C=NCCCC(C)CN=C=O +16669-59-3,O=C(C=C)NCOCC(C)C +80-48-8,O=S(=O)(OC)c1ccc(cc1)C +2386-87-0,O=C(OCC1CCC2OC2(C1))C3CCC4OC4(C3) +104-49-4,O=C=Nc1ccc(N=C=O)cc1 +103-71-9,O=C=Nc1ccccc1 +111-19-3,O=C(CCCCCCCCC(=O)Cl)Cl +7320-37-8,O1CC1CCCCCCCCCCCCCC +2451-62-9,O=C1N(C(=O)N(C(=O)N1CC2OC2)CC3OC3)CC4OC4 diff --git a/test/data/input_53.tsv b/test/data/input_53.tsv new file mode 100644 index 0000000..c46fdd4 --- /dev/null +++ b/test/data/input_53.tsv @@ -0,0 +1,54 @@ +Id Smiles +123-30-8 Oc1ccc(N)cc1 +68391-25-3 OC(COc1ccccc1)CNc2ccc(cc2)Cc3ccc(N)cc3 +62-53-3 Nc1ccccc1 +123-98-8 O=C(CCCCCCCC(=O)Cl)Cl +106-51-4 O=C1C=CC(=O)C=C1 +7144-65-2 O(c1ccccc1c2ccccc2)CC3OC3 +3130-19-6 O=C(OCC1CCC2OC2(C1))CCCCC(=O)OCC3CCC4OC4(C3) +140-95-4 O=C(NCO)NCO +2778-42-9 O=C=NC(c1cccc(c1)C(N=C=O)(C)C)(C)C +593-60-2 C=CBr +75-25-2 C(Br)(Br)Br +1852-16-0 O=C(C=C)NCOCCCC +107-58-4 O=C(C=C)NC(C)(C)C +592-35-8 O=C(OCCCC)N +2426-08-6 O(CCCC)CC1OC1 +79-07-2 O=C(N)CCl +110-75-8 O(C=C)CCCl +67-66-3 C(Cl)(Cl)Cl +26172-55-4 O=C1C=C(Cl)SN1C +598-09-4 O1CC1(C)CCl +2556-36-7 O=C=NC1CCC(N=C=O)CC1 +3271-22-5 n1c(nc(nc1OC)c2ccc3ccc4cccc5ccc2c3c45)OC +2680-03-7 O=C(C=C)N(C)C +13036-41-4 O=C(C=C)NCOCC +556-52-5 OCC1OC1 +2530-83-8 O(CCC[Si](OC)(OC)OC)CC1OC1 +106-90-1 O=C(OCC1OC1)C=C +26761-45-5 O=C(OCC1OC1)C(C)(C)CCCCCC +122-60-1 O(c1ccccc1)CC2OC2 +2210-79-9 O(c1ccccc1C)CC2OC2 +2461-15-6 O(CC1OC1)CC(CC)CCCC +75-02-5 FC=C +98-01-1 O=Cc1occc1 +111-30-8 O=CCCCC=O +107-22-2 O=CC=O +78-84-2 O=CC(C)C +11087-88-0 O=C(OCCCCCC(C)C)CCCCCCCC1OC1(CCCCCCCC) +3644-11-9 O=C(C=C)NCOC +1187-59-3 O=C(C=C)NC +54208-63-8 O(c1ccccc1Cc3ccccc3(OCC2OC2))CC4OC4 +110-26-9 O=C(C=C)NCNC(=O)C=C +1208-52-2 Nc1ccc(cc1)Cc2ccccc2(N) +71033-08-4 O(c1ccc(cc1)C(c3ccc(OCC(OCC2OC2)COCCCC)cc3)(C)C)CC(OCC4OC4)COCCCC +5165-97-9 O=C(C=C)NC(C)(C)CS(=O)(=O)O +34813-62-2 O=C=NCCCC(C)CN=C=O +16669-59-3 O=C(C=C)NCOCC(C)C +80-48-8 O=S(=O)(OC)c1ccc(cc1)C +2386-87-0 O=C(OCC1CCC2OC2(C1))C3CCC4OC4(C3) +104-49-4 O=C=Nc1ccc(N=C=O)cc1 +103-71-9 O=C=Nc1ccccc1 +111-19-3 O=C(CCCCCCCCC(=O)Cl)Cl +7320-37-8 O1CC1CCCCCCCCCCCCCC +2451-62-9 O=C1N(C(=O)N(C(=O)N1CC2OC2)CC3OC3)CC4OC4 diff --git a/test/dataset.rb b/test/dataset.rb index 055a029..11a4697 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -1,6 +1,21 @@ +# batch class + require_relative "setup.rb" class DatasetTest < MiniTest::Test + + # TODO + def test_from_pubchem + d = Dataset.from_pubchem 1190 + end + + def test_merge + skip "TODO" + end + + def test_to_sdf + skip "TODO" + end # basics @@ -21,6 +36,34 @@ class DatasetTest < MiniTest::Test # real datasets + def test_upload_csv_with_id + d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" + assert_equal 53, d.compounds.size + assert_equal 1, d.features.size + f = d.features[0] + assert_equal "original_id", f.name + assert_equal ["123-30-8"], d.values(d.compounds.first,f) + end + + def test_upload_tsv_with_id + d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" + assert_equal 53, d.compounds.size + assert_equal 1, d.features.size + assert_equal 1, d.features.size + f = d.features[0] + assert_equal "original_id", f.name + assert_equal ["123-30-8"], d.values(d.compounds.first,f) + end + + def test_upload_sdf + #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" + assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles + f = Feature.find_by(:name => "original_id") + assert_equal 35, d.features.size + assert_equal ["9415"], d.values(d.compounds.first,f) + end + def test_upload_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class @@ -103,6 +146,15 @@ class DatasetTest < MiniTest::Test d.delete end + def test_multiple_uploads + datasets = [] + 2.times do + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + datasets << d + end + assert_equal datasets[0],datasets[1] + end + # batch predictions def test_create_without_features_smiles_and_inchi diff --git a/test/setup.rb b/test/setup.rb index 4a11aa0..c4c04cb 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,8 +5,8 @@ require_relative '../lib/lazar.rb' include OpenTox #$mongo.database.drop #$gridfs = $mongo.database.fs # recreate GridFS indexes -PhysChem.descriptors +#PhysChem.descriptors TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first -Import::Enanomapper.import unless training_dataset +#training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first +#Import::Enanomapper.import unless training_dataset -- cgit v1.2.3