From 6e23be652ad90c747aaccf15258bdaa4458185a4 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 13 Nov 2018 14:32:09 +0100 Subject: public dataset download --- lib/download.rb | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 2 deletions(-) (limited to 'lib/download.rb') diff --git a/lib/download.rb b/lib/download.rb index 2f6b4f1..99d8842 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -22,6 +22,7 @@ module OpenTox end warnings = [] name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name table = [["SID","SMILES",name]] csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 cids = slice.collect{|s| s[2]} @@ -45,7 +46,58 @@ module OpenTox meta = { :species => species, :endpoint => endpoint, - :source => aid_url, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + unit = nil + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + # get unit + unit ||= CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0] == "RESULT_UNIT"}[0][8] + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES","-log10(#{name} [#{unit}])"]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][8] + if value + value = -Math.log10(value.to_f) + table << [slice[i][1].to_s,prop["CanonicalSMILES"],value] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." } + end + File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :unit => unit, :qmrf => qmrf, :warnings => warnings } @@ -53,6 +105,7 @@ module OpenTox end def self.mutagenicity + $logger.debug "Mutagenicity" # TODO add download/conversion programs to lazar dependencies hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" @@ -68,6 +121,7 @@ module OpenTox # convert hansen hansen = CSV.read File.join(parts,"hansen-original.csv") hansen.shift + map = {"0" => "non-mutagenic","1" => "mutagenic"} File.open(File.join(parts,"hansen.csv"),"w+") do |f| f.puts "ID,SMILES,Mutagenicity" @@ -122,12 +176,155 @@ module OpenTox :source => [kazius_url,hansen_url,efsa_url].join(", "), :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, } - File.open(File.join(File.dirname(__FILE__),"..","data","Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} # cleanup datasets << dataset datasets.each{|d| d.delete } end + def self.blood_brain_barrier + url = "http://cheminformatics.org/datasets/li/bbp2.smi" + name = "Blood_Brain_Barrier_Penetration-Human" + $logger.debug name + map = {"n" => "non-penetrating", "p" => "penetrating"} + table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t" + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts "ID,SMILES,#{name}" + table.each do |row| + f.puts [row[1],row[0],map[row[3]]].join(",") + end + end + meta = { + :species => "Human", + :endpoint => "Blood Brain Barrier Penetration", + :source => url, + :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"}, + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.loael + # TODO: fix url?? + url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv" + name = "Lowest_observed_adverse_effect_level-Rats" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts RestClientWrapper.get(url).to_s + end + meta = { + :species => "Rat", + :endpoint => "Lowest observed adverse effect level", + :source => url, + :unit => "mmol/kg_bw/day", + :qmrf => { + "name": "QMRF 4.14. Repeated dose toxicity", + "group": "QMRF 4.Human Health Effects" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.daphnia + # download of original file requires email request, this is a temporary solution + url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv" + name = "Acute_toxicity-Daphnia_magna" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts RestClientWrapper.get(url).to_s + end + meta = { "species": "Daphnia magna", + "endpoint": "Acute toxicity", + "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/", + "unit": "mmol/L", + "qmrf": { + "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)", + "name": "EC C. 2. Daphnia sp Acute Immobilisation Test" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.public_data + + # Classification + [ + { + :aid => 1205, + :species => "Rodents", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1208, + :species => "Rat", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1199, + :species => "Mouse", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + } + ].each do |assay| + Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogen", inactive: "non-carcinogen", qmrf: assay[:qmrf] + end + Download.mutagenicity + Download.blood_brain_barrier + + # Regression + [ + { + :aid => 1195, + :species => "Human", + :endpoint => "Maximum Recommended Daily Dose", + :qmrf => { + "group": "QMRF 4.14. Repeated dose toxicity", + "name": "OECD 452 Chronic Toxicity Studies" + }, + },{ + :aid => 1208, + :species => "Rat (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1199, + :species => "Mouse (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1188, + :species => "Fathead minnow", + :endpoint => "Acute toxicity", + :qmrf => { + "group": "QMRF 3.3. Acute toxicity to fish (lethality)", + "name": "EC C. 1. Acute Toxicity for Fish" + } + } + ].each do |assay| + Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf] + end + + Download.loael + Download.daphnia + +=begin + # 1204 estrogen receptor + # 1259408, # GENE-TOX + # 1159563 HepG2 cytotoxicity assay + # 588209 hepatotoxicity + # 1259333 cytotoxicity + # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity + # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells + # 116724 Acute toxicity determined after intravenal administration in mice + # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days +=end + end + end end -- cgit v1.2.3