1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
module OpenTox

  # Downloads toxicity data from public sources (PubChem, TU Berlin,
  # cheminformatics.org, EFSA), converts it into CSV datasets and writes
  # matching JSON metadata files into the local data directory.
  class Download

    # Directory where all downloaded/converted files are written.
    DATA = File.join(File.dirname(__FILE__),"..","data")

    # Download classification data for a PubChem bioassay and write
    # <endpoint>-<species>.csv (SID,SMILES,activity) plus a .json metadata
    # file into the data directory.
    #
    # @param aid [Integer,String] PubChem assay ID
    # @param active [String] class label assigned to "Active" outcomes
    # @param inactive [String] class label assigned to "Inactive" outcomes
    # @param species [String] species name (used in the output file name)
    # @param endpoint [String] endpoint name (used in the output file name)
    # @param qmrf [Hash,nil] optional QMRF annotation stored in the metadata
    def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil
      aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
      # Get assay data in chunks
      # Assay record retrieval is limited to 10000 SIDs per request
      # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
      list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
      listkey = list["ListKey"]
      size = list["Size"]
      start = 0
      csv = []
      while start < size
        url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
        csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
        start += 10000
      end
      warnings = []
      name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
      table = [["SID","SMILES",name]]
      csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
        cids = slice.collect{|s| s[2]}
        pubchem_cids = []
        JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
          i = cids.index(prop["CID"].to_s)
          value = slice[i][3]
          if value == "Active"
            table << [slice[i][1].to_s,prop["CanonicalSMILES"],active]
            pubchem_cids << prop["CID"].to_s
          elsif value == "Inactive"
            table << [slice[i][1].to_s,prop["CanonicalSMILES"],inactive]
            pubchem_cids << prop["CID"].to_s
          else
            # other outcomes (e.g. inconclusive) are skipped with a warning
            warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
          end
        end
        # CIDs missing from the property response have no retrievable SMILES
        (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
      end
      # use the DATA constant instead of rebuilding the identical path
      File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
      meta = {
        :species => species,
        :endpoint => endpoint,
        :source => aid_url,
        :qmrf => qmrf,
        :warnings => warnings
      }
      File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
    end

    # Build a merged Ames mutagenicity dataset (Salmonella typhimurium)
    # from the Hansen, Kazius/Bursi and EFSA sources and write
    # Mutagenicity-Salmonella_typhimurium.{csv,json} into the data
    # directory. Relies on the external programs unzip, wget and xls2csv.
    def self.mutagenicity
      # TODO add download/conversion programs to lazar dependencies
      hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
      kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
      efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
      parts = File.join(DATA, "parts")
      FileUtils.mkdir_p parts
      # remove leftovers from previous runs
      Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f }
      Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f }
      Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f }
      File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s }
      # convert hansen: keep ID and SMILES, map 0/1 to class labels
      hansen = CSV.read File.join(parts,"hansen-original.csv")
      hansen.shift
      map = {"0" => "non-mutagenic","1" => "mutagenic"}
      File.open(File.join(parts,"hansen.csv"),"w+") do |f|
        f.puts "ID,SMILES,Mutagenicity"
        hansen.each do |row|
          f.puts [row[0],row[5],map[row[2]]].join ","
        end
      end
      File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s }
      `cd #{parts} && unzip cas_4337.zip`
      # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is the
      # drop-in replacement (needed because the EFSA URL contains spaces)
      `cd #{parts} && wget #{URI::DEFAULT_PARSER.escape(efsa_url)} -O efsa.xls`
      `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv`
      # convert EFSA data to mutagenicity classifications:
      # keep Salmonella TA 98 / TA 100 rows that have a SMILES and a result
      i = 0
      db = {}
      CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
        if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
          begin
            c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
          rescue
            c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
          end
          db[c] ||= {}
          db[c][:id] ||= row[2]
          if row[33].match(/Positiv/i)
            db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
          elsif row[33].match(/Negativ/i)
            db[c][:value] ||= "non-mutagenic"
          end
        end
        i += 1
      end
      File.open(File.join(parts,"efsa.csv"),"w+") do |f|
        f.puts "ID,SMILES,Mutagenicity"
        db.each do |s,v|
          f.puts [v[:id],s,v[:value]].join ","
        end
      end
      # merge datasets
      hansen = Dataset.from_csv_file File.join(parts,"hansen.csv")
      efsa = Dataset.from_csv_file File.join(parts,"efsa.csv")
      kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf")
      datasets = [hansen,efsa,kazius]
      # harmonize the kazius labels with the other two sources
      map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
      dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
      dataset.merged_features.first.name = "Mutagenicity"
      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv}
      meta = {
        :species => "Salmonella typhimurium",
        :endpoint => "Mutagenicity",
        :source => [kazius_url,hansen_url,efsa_url].join(", "),
        :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
      }
      # use the DATA constant instead of rebuilding the identical path
      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
      # cleanup: remove intermediate datasets from the database
      datasets << dataset
      datasets.each{|d| d.delete }
    end
  end
end
|