From 8649795b3d5d63f227eed030286270b91ec39c68 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 12 Nov 2018 20:43:46 +0100 Subject: Mutagenicity download --- lib/download.rb | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/download.rb b/lib/download.rb index 9e30790..2f6b4f1 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -2,6 +2,8 @@ module OpenTox class Download + DATA = File.join(File.dirname(__FILE__),"..","data") + def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" @@ -19,9 +21,9 @@ module OpenTox start += 10000 end warnings = [] - name = endpoint+"-"+species + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") table = [["SID","SMILES",name]] - csv.each_slice(100) do |slice| # get SMILES in chunks + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 cids = slice.collect{|s| s[2]} pubchem_cids = [] JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| @@ -50,6 +52,82 @@ module OpenTox File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json} end - end + def self.mutagenicity + # TODO add download/conversion programs to lazar dependencies + hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" + kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" + efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" + + parts = File.join(DATA, "parts") + FileUtils.mkdir_p parts + Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f } + File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s } + + # convert hansen + hansen = CSV.read File.join(parts,"hansen-original.csv") + hansen.shift + map = {"0" => "non-mutagenic","1" => "mutagenic"} + File.open(File.join(parts,"hansen.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + hansen.each do |row| + f.puts [row[0],row[5],map[row[2]]].join "," + end + end + File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s } + `cd #{parts} && unzip cas_4337.zip` + `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls` + `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv` + # convert EFSA data to mutagenicity classifications + i = 0 + db = {} + CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| + if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] + begin + c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles + rescue + c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters + end + db[c] ||= {} + db[c][:id] ||= row[2] + if row[33].match(/Positiv/i) + db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100 + elsif row[33].match(/Negativ/i) + db[c][:value] ||= "non-mutagenic" + end + end + i += 1 + end + File.open(File.join(parts,"efsa.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + db.each do |s,v| + f.puts [v[:id],s,v[:value]].join "," + end + end + + # merge datasets + hansen = Dataset.from_csv_file File.join(parts,"hansen.csv") + efsa = Dataset.from_csv_file File.join(parts,"efsa.csv") + kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf") + datasets = [hansen,efsa,kazius] + map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true + dataset.merged_features.first.name = "Mutagenicity" + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv} + meta = { + :species => "Salmonella typhimurium", + :endpoint => "Mutagenicity", + :source => [kazius_url,hansen_url,efsa_url].join(", "), + :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, + } + File.open(File.join(File.dirname(__FILE__),"..","data","Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} + + # cleanup + datasets << dataset + datasets.each{|d| d.delete } + end + + end end -- cgit v1.2.3