PubChem classification download
[lazar] / lib / download.rb
module OpenTox

  # Dataset download helpers.
  class Download

    # Downloads the results of a PubChem assay, maps the PubChem activity
    # outcome to caller-supplied class labels and writes two files into the
    # "data" directory next to this library directory:
    #
    # * "<endpoint>-<species>.csv"  - table with the columns SID, SMILES and
    #   "<endpoint>-<species>"
    # * "<endpoint>-<species>.json" - metadata (species, endpoint, source URL,
    #   qmrf and the warnings collected during the download)
    #
    # Only rows whose outcome is exactly "Active" or "Inactive" end up in the
    # table; every other outcome, and every CID for which PubChem returned no
    # SMILES, is reported in the warnings instead.
    #
    # NOTE(review): rows are matched to the returned properties with
    # Array#index, so when several rows of one 100-row slice share a CID only
    # the first of them is used - confirm whether that is intended.
    #
    # @param aid      PubChem assay ID, interpolated into the assay URL
    # @param active   value written to the table for the outcome "Active"
    # @param inactive value written to the table for the outcome "Inactive"
    # @param species  species name; part of the file name and of the metadata
    # @param endpoint endpoint name; part of the file name and of the metadata
    # @param qmrf     stored unchanged in the metadata (defaults to nil)
    # @return [nil] the result is delivered through the two written files
    def self.pubchem_classification(aid:, active:, inactive:, species:, endpoint:, qmrf: nil)
      aid_url = File.join(PUBCHEM_URI, "assay/aid/#{aid}")

      # Get assay data in chunks
      # Assay record retrieval is limited to 10000 SIDs
      # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
      list = JSON.parse(RestClientWrapper.get(File.join(aid_url, "sids/JSON?list_return=listkey")).to_s)["IdentifierList"]
      listkey = list["ListKey"]
      size = list["Size"]
      page_size = 10000
      start = 0
      csv = []
      while start < size
        url = File.join(aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=#{page_size}")
        rows = CSV.parse(RestClientWrapper.get(url).to_s)
        # Keep data rows only (first field starts with a digit). to_s makes the
        # check safe for blank lines, which CSV.parse returns as empty rows.
        csv.concat(rows.select { |r| r[0].to_s =~ /\A\d/ })
        start += page_size
      end

      warnings = []
      name = "#{endpoint}-#{species}"
      table = [["SID", "SMILES", name]]
      # Row layout as used below: column 1 is written out as SID, column 2 is
      # the CID used for the SMILES lookup, column 3 is the activity outcome.
      csv.each_slice(100) do |slice| # get SMILES in chunks
        cids = slice.collect { |s| s[2] }
        retrieved_cids = []
        smiles_url = File.join(PUBCHEM_URI, "compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")
        properties = JSON.parse(RestClientWrapper.get(smiles_url).to_s)["PropertyTable"]["Properties"]
        properties.each do |prop|
          cid = prop["CID"].to_s
          # Remember every CID PubChem answered for, whatever its outcome, so
          # that only truly missing CIDs are reported as "could not retrieve".
          retrieved_cids << cid
          i = cids.index(cid)
          value = slice[i][3]
          if value == "Active"
            table << [slice[i][1].to_s, prop["CanonicalSMILES"], active]
          elsif value == "Inactive"
            table << [slice[i][1].to_s, prop["CanonicalSMILES"], inactive]
          else
            warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
          end
        end
        (cids - retrieved_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
      end

      data_dir = File.join(File.dirname(__FILE__), "..", "data")
      File.open(File.join(data_dir, name + ".csv"), "w+") { |f| f.puts table.collect { |row| row.join(",") }.join("\n") }
      meta = {
        :species => species,
        :endpoint => endpoint,
        :source => aid_url,
        :qmrf => qmrf,
        :warnings => warnings
      }
      File.open(File.join(data_dir, name + ".json"), "w+") { |f| f.puts meta.to_json }
    end

  end

end