summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorhelma@in-silico.ch <helma@in-silico.ch>2018-11-12 20:43:46 +0100
committerhelma@in-silico.ch <helma@in-silico.ch>2018-11-12 20:43:46 +0100
commit8649795b3d5d63f227eed030286270b91ec39c68 (patch)
treeaa9168847a9c0aa6f6abe4654304b604a441ff30 /lib
parent6d68a1ca94937a0553f61ebbbbd317dae54ce4e6 (diff)
Mutagenicity download
Diffstat (limited to 'lib')
-rw-r--r--lib/download.rb84
1 file changed, 81 insertions, 3 deletions
diff --git a/lib/download.rb b/lib/download.rb
index 9e30790..2f6b4f1 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -2,6 +2,8 @@ module OpenTox
class Download
+ DATA = File.join(File.dirname(__FILE__),"..","data")
+
def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil
aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
@@ -19,9 +21,9 @@ module OpenTox
start += 10000
end
warnings = []
- name = endpoint+"-"+species
+ name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
table = [["SID","SMILES",name]]
- csv.each_slice(100) do |slice| # get SMILES in chunks
+ csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
cids = slice.collect{|s| s[2]}
pubchem_cids = []
JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
@@ -50,6 +52,82 @@ module OpenTox
File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json}
end
- end
+ def self.mutagenicity
+ # TODO add download/conversion programs to lazar dependencies
+ hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
+ kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
+ efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
+
+ parts = File.join(DATA, "parts")
+ FileUtils.mkdir_p parts
+ Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f }
+ Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f }
+ Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f }
+ File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s }
+
+ # convert hansen
+ hansen = CSV.read File.join(parts,"hansen-original.csv")
+ hansen.shift
+ map = {"0" => "non-mutagenic","1" => "mutagenic"}
+ File.open(File.join(parts,"hansen.csv"),"w+") do |f|
+ f.puts "ID,SMILES,Mutagenicity"
+ hansen.each do |row|
+ f.puts [row[0],row[5],map[row[2]]].join ","
+ end
+ end
+ File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s }
+ `cd #{parts} && unzip cas_4337.zip`
+ `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls`
+ `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv`
+ # convert EFSA data to mutagenicity classifications
+ i = 0
+ db = {}
+ CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
+ if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
+ begin
+ c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
+ rescue
+ c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
+ end
+ db[c] ||= {}
+ db[c][:id] ||= row[2]
+ if row[33].match(/Positiv/i)
+ db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
+ elsif row[33].match(/Negativ/i)
+ db[c][:value] ||= "non-mutagenic"
+ end
+ end
+ i += 1
+ end
+ File.open(File.join(parts,"efsa.csv"),"w+") do |f|
+ f.puts "ID,SMILES,Mutagenicity"
+ db.each do |s,v|
+ f.puts [v[:id],s,v[:value]].join ","
+ end
+ end
+
+ # merge datasets
+ hansen = Dataset.from_csv_file File.join(parts,"hansen.csv")
+ efsa = Dataset.from_csv_file File.join(parts,"efsa.csv")
+ kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf")
+ datasets = [hansen,efsa,kazius]
+ map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
+ dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
+ dataset.merged_features.first.name = "Mutagenicity"
+ File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv}
+ meta = {
+ :species => "Salmonella typhimurium",
+ :endpoint => "Mutagenicity",
+ :source => [kazius_url,hansen_url,efsa_url].join(", "),
+ :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
+ }
+ File.open(File.join(File.dirname(__FILE__),"..","data","Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
+
+ # cleanup
+ datasets << dataset
+ datasets.each{|d| d.delete }
+ end
+
+ end
end