summaryrefslogtreecommitdiff
path: root/lib/download.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/download.rb')
-rw-r--r--lib/download.rb80
1 files changed, 0 insertions, 80 deletions
diff --git a/lib/download.rb b/lib/download.rb
index 2546dc4..5b6a68e 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -119,86 +119,6 @@ module OpenTox
File.join(DATA,name+".csv")
end
- # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
- def self.mutagenicity
- $logger.debug "Mutagenicity"
- hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
- kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
- efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
-
- parts = File.join(DATA, "parts")
- FileUtils.mkdir_p parts
- Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f }
- Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f }
- Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f }
- File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s }
-
- # convert hansen
- hansen = CSV.read File.join(parts,"hansen-original.csv")
- hansen.shift
-
- map = {"0" => "non-mutagenic","1" => "mutagenic"}
- File.open(File.join(parts,"hansen.csv"),"w+") do |f|
- f.puts "ID,SMILES,Mutagenicity"
- hansen.each do |row|
- f.puts [row[0],row[5],map[row[2]]].join ","
- end
- end
- File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s }
- `cd #{parts} && unzip cas_4337.zip`
- `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls`
- `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv`
-
- # convert EFSA data to mutagenicity classifications
- i = 0
- db = {}
- CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
- if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
- begin
- c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
- rescue
- c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
- end
- db[c] ||= {}
- db[c][:id] ||= row[2]
- if row[33].match(/Positiv/i)
- db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
- elsif row[33].match(/Negativ/i)
- db[c][:value] ||= "non-mutagenic"
- end
- end
- i += 1
- end
- File.open(File.join(parts,"efsa.csv"),"w+") do |f|
- f.puts "ID,SMILES,Mutagenicity"
- db.each do |s,v|
- f.puts [v[:id],s,v[:value]].join ","
- end
- end
-
- # merge datasets
- hansen = Dataset.from_csv_file File.join(parts,"hansen.csv")
- efsa = Dataset.from_csv_file File.join(parts,"efsa.csv")
- kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf")
- datasets = [hansen,efsa,kazius]
- map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
- dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
- dataset.merged_features.first.name = "Mutagenicity"
- File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
- meta = {
- :species => "Salmonella typhimurium",
- :endpoint => "Mutagenicity",
- :source => [kazius_url,hansen_url,efsa_url].join(", "),
- :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
- }
- File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
-
- # cleanup
- datasets << dataset
- datasets.each{|d| d.delete }
- File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv")
- end
-
# Download Blood Brain Barrier Penetration dataset into the data folder
def self.blood_brain_barrier
url = "http://cheminformatics.org/datasets/li/bbp2.smi"