Diffstat (limited to 'lib')
-rw-r--r--  lib/compound.rb  |  6
-rw-r--r--  lib/dataset.rb   | 55
-rw-r--r--  lib/download.rb  | 80
3 files changed, 5 insertions, 136 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 615ea6e..4436e9d 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -4,6 +4,8 @@ require 'openbabel'
class Compound
DEFAULT_FINGERPRINT = "MP2D"
+ attr_reader :smiles, :fingerprints
+
def initialize smiles
@smiles = smiles
@fingerprints = {}
@@ -123,10 +125,10 @@ class Compound
# Create a compound from SDF
# @param [String] SDF
- # @return [OpenTox::Compound]
+ # @return [Compound]
def self.from_sdf sdf
# do not store sdf because it might be 2D
- Compound.from_smiles obconversion(sdf,"sdf","can")
+ self.new obconversion(sdf,"sdf","can")
end
# Create a compound from name. Relies on an external service for name lookups.
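
The reworked Compound API above can be exercised roughly as follows. This is a minimal usage sketch, not part of the commit; it assumes lib/compound.rb and its OpenBabel dependency are loadable, and the file name benzene.sdf is a made-up placeholder.

    require_relative "compound"

    # from_sdf converts the record to canonical SMILES internally and does not
    # keep the SDF itself (it might be 2D), per the comment in the diff above.
    sdf = File.read("benzene.sdf")       # hypothetical input file
    compound = Compound.from_sdf sdf
    puts compound.smiles                 # exposed by the new attr_reader
    puts compound.fingerprints.inspect   # {} until fingerprints are computed
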
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 8cb343f..87e7fef 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -68,63 +68,10 @@ class Dataset
@independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq
print_variables
end
+
end
=begin
- # Create a dataset from SDF file
- # files with a single data field are read as BioActivities (i.e. dependent variable)
- # files with multiple data fields are read as SubstanceProperties (i.e. independent variable)
- # @param [File]
- # @return [OpenTox::Dataset]
- def self.from_sdf_file file
- md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
- dataset = self.find_by(:md5 => md5)
- if dataset
- $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
- else
- $logger.debug "Parsing #{file}."
-
- dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
- original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID")
-
- read_result = false
- sdf = ""
- feature_name = ""
- compound = nil
- features = {}
- table = [["ID","SMILES"]]
-
- File.readlines(file).each do |line|
- if line.match %r{\$\$\$\$}
- sdf << line
- id = sdf.split("\n").first.chomp
- compound = Compound.from_sdf sdf
- row = [id,compound.smiles]
- features.each do |f,v|
- table[0] << f unless table[0].include? f
- row[table[0].index(f)] = v
- end
- table << row
- sdf = ""
- features = {}
- elsif line.match /^>\s+</
- feature_name = line.match(/^>\s+<(.*)>/)[1]
- read_result = true
- else
- if read_result
- value = line.chomp
- features[feature_name] = value
- read_result = false
- else
- sdf << line
- end
- end
- end
- dataset.parse_table table
- end
- dataset
- end
-
# Create a dataset from PubChem Assay
# @param [Integer] PubChem AssayID (AID)
# @return [OpenTox::Dataset]
diff --git a/lib/download.rb b/lib/download.rb
index 2546dc4..5b6a68e 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -119,86 +119,6 @@ module OpenTox
File.join(DATA,name+".csv")
end
- # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
- def self.mutagenicity
- $logger.debug "Mutagenicity"
- hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
- kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
- efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
-
- parts = File.join(DATA, "parts")
- FileUtils.mkdir_p parts
- Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f }
- Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f }
- Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f }
- File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s }
-
- # convert hansen
- hansen = CSV.read File.join(parts,"hansen-original.csv")
- hansen.shift
-
- map = {"0" => "non-mutagenic","1" => "mutagenic"}
- File.open(File.join(parts,"hansen.csv"),"w+") do |f|
- f.puts "ID,SMILES,Mutagenicity"
- hansen.each do |row|
- f.puts [row[0],row[5],map[row[2]]].join ","
- end
- end
- File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s }
- `cd #{parts} && unzip cas_4337.zip`
- `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls`
- `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv`
-
- # convert EFSA data to mutagenicity classifications
- i = 0
- db = {}
- CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
- if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
- begin
- c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
- rescue
- c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
- end
- db[c] ||= {}
- db[c][:id] ||= row[2]
- if row[33].match(/Positiv/i)
- db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
- elsif row[33].match(/Negativ/i)
- db[c][:value] ||= "non-mutagenic"
- end
- end
- i += 1
- end
- File.open(File.join(parts,"efsa.csv"),"w+") do |f|
- f.puts "ID,SMILES,Mutagenicity"
- db.each do |s,v|
- f.puts [v[:id],s,v[:value]].join ","
- end
- end
-
- # merge datasets
- hansen = Dataset.from_csv_file File.join(parts,"hansen.csv")
- efsa = Dataset.from_csv_file File.join(parts,"efsa.csv")
- kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf")
- datasets = [hansen,efsa,kazius]
- map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
- dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
- dataset.merged_features.first.name = "Mutagenicity"
- File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
- meta = {
- :species => "Salmonella typhimurium",
- :endpoint => "Mutagenicity",
- :source => [kazius_url,hansen_url,efsa_url].join(", "),
- :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
- }
- File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
-
- # cleanup
- datasets << dataset
- datasets.each{|d| d.delete }
- File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv")
- end
-
# Download Blood Brain Barrier Penetration dataset into the data folder
def self.blood_brain_barrier
url = "http://cheminformatics.org/datasets/li/bbp2.smi"