From 791398c12af4f8290095425dac87e3c852905ab6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 7 Sep 2019 18:20:10 +0200 Subject: obsolete data and java directories deleted --- lib/compound.rb | 6 +++-- lib/dataset.rb | 55 +-------------------------------------- lib/download.rb | 80 --------------------------------------------------------- 3 files changed, 5 insertions(+), 136 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 615ea6e..4436e9d 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -4,6 +4,8 @@ require 'openbabel' class Compound DEFAULT_FINGERPRINT = "MP2D" + attr_reader :smiles, :fingerprints + def initialize smiles @smiles = smiles @fingerprints = {} @@ -123,10 +125,10 @@ class Compound # Create a compound from SDF # @param [String] SDF - # @return [OpenTox::Compound] + # @return [Compound] def self.from_sdf sdf # do not store sdf because it might be 2D - Compound.from_smiles obconversion(sdf,"sdf","can") + self.new obconversion(sdf,"sdf","can") end # Create a compound from name. Relies on an external service for name lookups. diff --git a/lib/dataset.rb b/lib/dataset.rb index 8cb343f..87e7fef 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -68,63 +68,10 @@ class Dataset @independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq print_variables end + end =begin - # Create a dataset from SDF file - # files with a single data field are read as BioActivities (i.e. dependent variable) - # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) - # @param [File] - # @return [OpenTox::Dataset] - def self.from_sdf_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." 
- - dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) - original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") - - read_result = false - sdf = "" - feature_name = "" - compound = nil - features = {} - table = [["ID","SMILES"]] - - File.readlines(file).each do |line| - if line.match %r{\$\$\$\$} - sdf << line - id = sdf.split("\n").first.chomp - compound = Compound.from_sdf sdf - row = [id,compound.smiles] - features.each do |f,v| - table[0] << f unless table[0].include? f - row[table[0].index(f)] = v - end - table << row - sdf = "" - features = {} - elsif line.match /^>\s+</ - feature_name = line.match(/^>\s+<(.*)>/)[1] - read_result = true - else - if read_result - value = line.chomp - features[feature_name] = value - read_result = false - else - sdf << line - end - end - end - dataset.parse_table table - end - dataset - end - # Create a dataset from PubChem Assay # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] diff --git a/lib/download.rb b/lib/download.rb index 2546dc4..5b6a68e 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -119,86 +119,6 @@ module OpenTox File.join(DATA,name+".csv") end - # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder - def self.mutagenicity - $logger.debug "Mutagenicity" - hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" - kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" - efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" - - parts = File.join(DATA, "parts") - FileUtils.mkdir_p parts - Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f } - Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f } - Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f } - File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s } - - # convert hansen - hansen = CSV.read 
File.join(parts,"hansen-original.csv") - hansen.shift - - map = {"0" => "non-mutagenic","1" => "mutagenic"} - File.open(File.join(parts,"hansen.csv"),"w+") do |f| - f.puts "ID,SMILES,Mutagenicity" - hansen.each do |row| - f.puts [row[0],row[5],map[row[2]]].join "," - end - end - File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s } - `cd #{parts} && unzip cas_4337.zip` - `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls` - `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv` - - # convert EFSA data to mutagenicity classifications - i = 0 - db = {} - CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| - if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] - begin - c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles - rescue - c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters - end - db[c] ||= {} - db[c][:id] ||= row[2] - if row[33].match(/Positiv/i) - db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100 - elsif row[33].match(/Negativ/i) - db[c][:value] ||= "non-mutagenic" - end - end - i += 1 - end - File.open(File.join(parts,"efsa.csv"),"w+") do |f| - f.puts "ID,SMILES,Mutagenicity" - db.each do |s,v| - f.puts [v[:id],s,v[:value]].join "," - end - end - - # merge datasets - hansen = Dataset.from_csv_file File.join(parts,"hansen.csv") - efsa = Dataset.from_csv_file File.join(parts,"efsa.csv") - kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf") - datasets = [hansen,efsa,kazius] - map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} - dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true - 
dataset.merged_features.first.name = "Mutagenicity" - File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv} - meta = { - :species => "Salmonella typhimurium", - :endpoint => "Mutagenicity", - :source => [kazius_url,hansen_url,efsa_url].join(", "), - :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, - } - File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} - - # cleanup - datasets << dataset - datasets.each{|d| d.delete } - File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv") - end - # Download Blood Brain Barrier Penetration dataset into the data folder def self.blood_brain_barrier url = "http://cheminformatics.org/datasets/li/bbp2.smi" -- cgit v1.2.3