summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: helma@in-silico.ch <helma@in-silico.ch> 2018-11-12 17:08:51 +0100
committer: helma@in-silico.ch <helma@in-silico.ch> 2018-11-12 17:08:51 +0100
commit: 6d68a1ca94937a0553f61ebbbbd317dae54ce4e6 (patch)
tree: 02df42992b8ec3bcd8230826b8ccde5820c35359 /lib
parent: cf80ed17102a0368df8d65037d113b521cdf6f0c (diff)
PubChem classification download
Diffstat (limited to 'lib')
-rw-r--r-- lib/download.rb 55
-rw-r--r-- lib/lazar.rb 1
2 files changed, 56 insertions, 0 deletions
diff --git a/lib/download.rb b/lib/download.rb
new file mode 100644
index 0000000..9e30790
--- /dev/null
+++ b/lib/download.rb
@@ -0,0 +1,55 @@
+module OpenTox
+
+ class Download
+
+ def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
+ warnings = []
+ name = endpoint+"-"+species
+ table = [["SID","SMILES",name]]
+ csv.each_slice(100) do |slice| # get SMILES in chunks
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][3]
+ if value == "Active"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],active]
+ pubchem_cids << prop["CID"].to_s
+ elsif value == "Inactive"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],inactive]
+ pubchem_cids << prop["CID"].to_s
+ else
+ warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
+ end
+ end
+ (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
+ end
+ File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ meta = {
+ :species => species,
+ :endpoint => endpoint,
+ :source => aid_url,
+ :qmrf => qmrf,
+ :warnings => warnings
+ }
+ File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ end
+
+end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 69a6f15..6f14f67 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -97,5 +97,6 @@ CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidat
"train-test-validation.rb",
"leave-one-out-validation.rb",
"crossvalidation.rb",
+ "download.rb"
#"import.rb",
].each{ |f| require_relative f }