From 6e23be652ad90c747aaccf15258bdaa4458185a4 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 13 Nov 2018 14:32:09 +0100 Subject: public dataset download --- lib/download.rb | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- lib/import.rb | 13 ++++ lib/lazar.rb | 4 +- lib/model.rb | 3 +- 4 files changed, 216 insertions(+), 5 deletions(-) create mode 100644 lib/import.rb (limited to 'lib') diff --git a/lib/download.rb b/lib/download.rb index 2f6b4f1..99d8842 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -22,6 +22,7 @@ module OpenTox end warnings = [] name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name table = [["SID","SMILES",name]] csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 cids = slice.collect{|s| s[2]} @@ -45,7 +46,58 @@ module OpenTox meta = { :species => species, :endpoint => endpoint, - :source => aid_url, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + unit = nil + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + # get unit + unit ||= CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0] == "RESULT_UNIT"}[0][8] + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES","-log10(#{name} [#{unit}])"]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][8] + if value + value = -Math.log10(value.to_f) + table << [slice[i][1].to_s,prop["CanonicalSMILES"],value] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." } + end + File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :unit => unit, :qmrf => qmrf, :warnings => warnings } @@ -53,6 +105,7 @@ module OpenTox end def self.mutagenicity + $logger.debug "Mutagenicity" # TODO add download/conversion programs to lazar dependencies hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" @@ -68,6 +121,7 @@ module OpenTox # convert hansen hansen = CSV.read File.join(parts,"hansen-original.csv") hansen.shift + map = {"0" => "non-mutagenic","1" => "mutagenic"} File.open(File.join(parts,"hansen.csv"),"w+") do |f| f.puts "ID,SMILES,Mutagenicity" @@ -122,12 +176,155 @@ module OpenTox :source => [kazius_url,hansen_url,efsa_url].join(", "), :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, } - File.open(File.join(File.dirname(__FILE__),"..","data","Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} # cleanup datasets << dataset datasets.each{|d| d.delete } end + def self.blood_brain_barrier + url = "http://cheminformatics.org/datasets/li/bbp2.smi" + name = "Blood_Brain_Barrier_Penetration-Human" + $logger.debug name + map = {"n" => "non-penetrating", "p" => "penetrating"} + table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t" + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts "ID,SMILES,#{name}" + table.each do |row| + f.puts [row[1],row[0],map[row[3]]].join(",") + end + end + meta = { + :species => "Human", + :endpoint => "Blood Brain Barrier Penetration", + :source => url, + :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"}, + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.loael + # TODO: fix url?? + url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv" + name = "Lowest_observed_adverse_effect_level-Rats" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts RestClientWrapper.get(url).to_s + end + meta = { + :species => "Rat", + :endpoint => "Lowest observed adverse effect level", + :source => url, + :unit => "mmol/kg_bw/day", + :qmrf => { + "name": "QMRF 4.14. Repeated dose toxicity", + "group": "QMRF 4.Human Health Effects" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.daphnia + # download of original file requires email request, this is a temporary solution + url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv" + name = "Acute_toxicity-Daphnia_magna" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts RestClientWrapper.get(url).to_s + end + meta = { "species": "Daphnia magna", + "endpoint": "Acute toxicity", + "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/", + "unit": "mmol/L", + "qmrf": { + "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)", + "name": "EC C. 2. Daphnia sp Acute Immobilisation Test" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + def self.public_data + + # Classification + [ + { + :aid => 1205, + :species => "Rodents", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1208, + :species => "Rat", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1199, + :species => "Mouse", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + } + ].each do |assay| + Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogen", inactive: "non-carcinogen", qmrf: assay[:qmrf] + end + Download.mutagenicity + Download.blood_brain_barrier + + # Regression + [ + { + :aid => 1195, + :species => "Human", + :endpoint => "Maximum Recommended Daily Dose", + :qmrf => { + "group": "QMRF 4.14. Repeated dose toxicity", + "name": "OECD 452 Chronic Toxicity Studies" + }, + },{ + :aid => 1208, + :species => "Rat (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1199, + :species => "Mouse (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1188, + :species => "Fathead minnow", + :endpoint => "Acute toxicity", + :qmrf => { + "group": "QMRF 3.3. Acute toxicity to fish (lethality)", + "name": "EC C. 1. Acute Toxicity for Fish" + } + } + ].each do |assay| + Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf] + end + + Download.loael + Download.daphnia + +=begin + # 1204 estrogen receptor + # 1259408, # GENE-TOX + # 1159563 HepG2 cytotoxicity assay + # 588209 hepatotoxicity + # 1259333 cytotoxicity + # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity + # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells + # 116724 Acute toxicity determined after intravenal administration in mice + # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days +=end + end + end end diff --git a/lib/import.rb b/lib/import.rb new file mode 100644 index 0000000..831efcb --- /dev/null +++ b/lib/import.rb @@ -0,0 +1,13 @@ +module OpenTox + + class Import + + def self.public_data + # TODO clear database? + Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f| + $logger.debug f + Model::Validation.from_csv_file f + end + end + end +end diff --git a/lib/lazar.rb b/lib/lazar.rb index 6f14f67..c3bbbf3 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -97,6 +97,6 @@ CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidat "train-test-validation.rb", "leave-one-out-validation.rb", "crossvalidation.rb", - "download.rb" - #"import.rb", + "download.rb", + "import.rb", ].each{ |f| require_relative f } diff --git a/lib/model.rb b/lib/model.rb index 966460b..70ae43c 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -422,6 +422,7 @@ module OpenTox field :species, type: String field :source, type: String field :unit, type: String + field :warnings, type: Array field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId @@ -494,7 +495,7 @@ module OpenTox # Create and validate a lazar model from a csv file with training data and a json file with metadata # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. - # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations + # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file -- cgit v1.2.3