From 47a49508a736549006418ac9a9607ec0f5083a55 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 19:31:48 +0200 Subject: partial pubchem classification import --- lib/dataset.rb | 51 ++++++++++++++++++--------------------------------- test/dataset.rb | 8 ++++++-- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/lib/dataset.rb b/lib/dataset.rb index 17c30d5..1cc388c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -156,39 +156,24 @@ module OpenTox # Parsers # Create a dataset from PubChem Assay - # @param [File] + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] def self.from_pubchem aid - csv = RestClientWrapper.get "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" - table = CSV.read csv - puts table -=begin - dataset = self.new(:source => file, :name => name, :md5 => md5) - dataset.parse_table table, accept_empty_values - else - puts csv -i = 0 -activities = [] -File.readlines(ARGV[0]).each do |line| - if i > 2 - tokens = line.split "," - p line if tokens[1].empty? - activities << [tokens[1],tokens[3]] - end - i += 1 -end - -puts "SMILES,Activity" -activities.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|e| e[0]} - smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n") - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - act = slice[i] - puts [smi.chomp,act[1]].join(",") - end -end -=end + url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" + csv = CSV.parse(RestClientWrapper.get(url)) + csv.select!{|r| r[0].match /^\d/} # discard header rows + table = [["SID","SMILES","Activity"]] + csv.each_slice(100) do |slice| + sids = slice.collect{|s| s[1]} + smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + table << [slice[i][1],smi.chomp,slice[i][3]] + end + end + dataset = self.new(:source => url) # TODO name + dataset.parse_table table, false + dataset end # Create a dataset from SDF file @@ -233,7 +218,6 @@ end feature = NominalFeature.find_or_create_by(:name => feature_name) end features[feature] = value - #p compound.smiles, feature.name, value read_result = false else sdf << line @@ -297,7 +281,8 @@ end # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + original_id ? j = i+2 : j = i+1 + values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature diff --git a/test/dataset.rb b/test/dataset.rb index 11a4697..5157803 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -4,9 +4,13 @@ require_relative "setup.rb" class DatasetTest < MiniTest::Test - # TODO def test_from_pubchem - d = Dataset.from_pubchem 1190 + d = Dataset.from_pubchem 1191 + assert_equal 87, d.compounds.size + assert_equal 2, d.features.size + assert_equal "Active", d.values(d.compounds[10],d.features[1]) + # TODO endpoint name + # TODO regression import end def test_merge -- cgit v1.2.3