summaryrefslogtreecommitdiff
path: root/lib/download.rb
blob: 9e3079021fee7c632ba5a5e7c5388c253d5aa441 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
module OpenTox

  # Download helpers that turn public data sources into local training data files.
  class Download

    # Download the results of a PubChem assay and store them as a two-class
    # CSV dataset plus a JSON metadata file.
    #
    # Two files are written into +data_dir+, both named "<endpoint>-<species>":
    # * <name>.csv with the columns SID, SMILES and <name>; one row per assay
    #   result whose outcome is "Active" or "Inactive"
    # * <name>.json with species, endpoint, source (assay URL), qmrf and the
    #   warnings collected for every result that was skipped
    #
    # The assay CSV returned by PubChem is read positionally: column 1 is used
    # as SID, column 2 as CID and column 3 as activity outcome.
    #
    # NOTE(review): +name+ is used verbatim as file name, so an endpoint or
    # species containing "/" ends up in a different directory - confirm callers
    # never pass such values.
    #
    # @param aid [String, Integer] PubChem assay identifier
    # @param active [String] class label written for "Active" outcomes
    # @param inactive [String] class label written for "Inactive" outcomes
    # @param species [String] species, second part of the dataset name
    # @param endpoint [String] endpoint, first part of the dataset name
    # @param qmrf [Object, nil] stored unchanged under the "qmrf" metadata key
    # @param data_dir [String] existing directory that receives both files
    # @return [nil]
    def self.pubchem_classification(aid:, active:, inactive:, species:, endpoint:, qmrf: nil,
                                    data_dir: File.join(File.dirname(__FILE__), "..", "data"))
      aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"

      # Get assay data in chunks
      # Assay record retrieval is limited to 10000 SIDs
      # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
      list = JSON.parse(RestClientWrapper.get(File.join(aid_url, "sids/JSON?list_return=listkey")).to_s)["IdentifierList"]
      listkey = list["ListKey"]
      size = Integer(list["Size"]) # fail early instead of looping on a missing size
      chunk_size = 10000
      csv = []
      (0...size).step(chunk_size) do |start|
        url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=#{chunk_size}"
        # Data rows start with a digit; header rows and blank lines (parsed as
        # empty arrays, i.e. r[0] is nil) are discarded.
        csv.concat(CSV.parse(RestClientWrapper.get(url).to_s).select { |r| r[0].to_s =~ /\A\d/ })
      end

      warnings = []
      name = endpoint+"-"+species
      table = [["SID","SMILES",name]]
      csv.each_slice(100) do |slice| # get SMILES in chunks
        # Several SIDs may share one CID, so keep every row of a CID together.
        rows_by_cid = slice.group_by { |row| row[2].to_s }
        # Rows without CID cannot be resolved to a structure; an empty entry
        # would also corrupt the comma separated CID list of the request.
        (rows_by_cid.delete("") || []).each do |row|
          warnings << "Ignoring SID #{row[1]}, because it has no PubChem CID."
        end
        next if rows_by_cid.empty?

        cids = rows_by_cid.keys
        url = File.join(PUBCHEM_URI, "compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")
        properties = JSON.parse(RestClientWrapper.get(url).to_s)["PropertyTable"]["Properties"]
        retrieved = [] # CIDs for which a SMILES was returned, whatever their outcome
        properties.each do |prop|
          cid = prop["CID"].to_s
          smiles = prop["CanonicalSMILES"]
          # Skip entries without structure, repeated entries and CIDs that were not requested.
          next if smiles.to_s.empty? || retrieved.include?(cid) || !rows_by_cid.key?(cid)

          retrieved << cid
          rows_by_cid[cid].each do |row|
            case row[3]
            when "Active"
              table << [row[1].to_s, smiles, active]
            when "Inactive"
              table << [row[1].to_s, smiles, inactive]
            else
              warnings << "Ignoring CID #{cid}/ SMILES #{smiles}, because PubChem activity is '#{row[3]}'."
            end
          end
        end
        (cids - retrieved).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
      end

      # Let CSV do the quoting, labels and names may contain commas or quotes.
      CSV.open(File.join(data_dir, name+".csv"), "w") { |out| table.each { |row| out << row } }
      meta = {
        :species => species,
        :endpoint => endpoint,
        :source => aid_url,
        :qmrf => qmrf,
        :warnings => warnings
      }
      File.open(File.join(data_dir, name+".json"), "w+") { |f| f.puts meta.to_json }
    end

  end

end