From 8b31acab67e22f30a87c995a94f1ee1e2a3d510f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 10 Oct 2018 21:39:11 +0200 Subject: dataset tests fixed --- lib/dataset.rb | 369 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 190 insertions(+), 179 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 6ad3215..b6c6173 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -67,166 +67,76 @@ module OpenTox #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end - # Dataset operations - - # Merge an array of datasets - # @param [Array] OpenTox::Dataset Array to be merged - # @param [Hash] feature modifications - # @param [Hash] value modifications - # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, feature_map=nil, value_map=nil - dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) - datasets.each do |d| - d.substances.each do |s| - d.features.each do |f| - d.values(s,f).each do |v| - f = feature_map[f] if feature_map and feature_map[f] - v = value_map[v] if value_map and value_map[v] - dataset.add s,f,v #unless dataset.values(s,f).include? v - end - end - end - end - dataset.save - dataset - end - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} - chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} - end - dataset.save - dataset - end - start = last+1 - chunks << chunk - end - chunks - end - - # Serialisation + # Parsers - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names - # @return [String] - # TODO original_id - def to_csv(inchi=false) - CSV.generate() do |csv| - compound = substances.first.is_a? Compound - if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - else - csv << ["Name"] + features.collect{|f| f.name} - end - substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq - - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." - else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row - end + # Create a dataset from CSV file + # @param [File] Input file with the following format: + # - ID column (optional): header containing "ID" string, arbitrary ID values + # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings + # - one or more properties column(s): header with property name(s), property values + # files with a single property column are read as BioActivities (i.e. dependent variable) + # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) + # @return [OpenTox::Dataset] + def self.from_csv_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + table = nil + sep = "," + ["\t",";"].each do |s| # guess alternative CSV separator + if File.readlines(file).first.match(/#{s}/) + sep = s + break end end - end - end - - # Convert dataset to SDF format - # @return [String] SDF string - def to_sdf - sdf = "" - substances.each do |substance| - sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") - sdf_lines[0] = substance.smiles - sdf += sdf_lines.join("\n") - features.each do |f| - sdf += "\n> <#{f.name}>\n" - sdf += values(substance,f).uniq.join "," - end - sdf += "\n$$$$\n" - end - sdf - end - - # Parsers - - # Create a dataset from PubChem Assay - # @param [Integer] PubChem AssayID (AID) - # @return [OpenTox::Dataset] - def self.from_pubchem aid - url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" - csv = CSV.parse(RestClientWrapper.get(url)) - csv.select!{|r| r[0].match /^\d/} # discard header rows - table = [["SID","SMILES","Activity"]] - csv.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|s| s[1]} - smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - table << [slice[i][1],smi.chomp,slice[i][3]] + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + if table + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + dataset.parse_table table + else + bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." end end - dataset = self.new(:source => url) # TODO name - dataset.parse_table table, false dataset end # Create a dataset from SDF file + # files with a single data field are read as BioActivities (i.e. dependent variable) + # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) # @param [File] # @return [OpenTox::Dataset] - def self.from_sdf_file file, map=nil + def self.from_sdf_file file md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." - table = nil - read_result = false - sdf = "" - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) + + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") + read_result = false + sdf = "" feature_name = "" compound = nil features = {} + table = [["ID","SMILES"]] File.readlines(file).each do |line| if line.match %r{\$\$\$\$} sdf << line id = sdf.split("\n").first.chomp compound = Compound.from_sdf sdf - dataset.add compound, original_id, id - features.each { |f,v| dataset.add compound, f, v } + row = [id,compound.smiles] + features.each do |f,v| + table[0] << f unless table[0].include? f + row[table[0].index(f)] = v + end + table << row sdf = "" features = {} elsif line.match /^>\s+ feature_name, :measured => true) - value = value.to_f - else - feature = NominalFeature.find_or_create_by(:name => feature_name, :measured => true) - end - features[feature] = value + features[feature_name] = value read_result = false else sdf << line end end end + dataset.parse_table table end dataset.save dataset end - - # Create a dataset from CSV file - # @param [File] - # @param [TrueClass,FalseClass] accept or reject empty values + + # Create a dataset from PubChem Assay + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] - def self.from_csv_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - table = nil - [",","\t",";"].each do |sep| # guess CSV separator - if File.readlines(file).first.match(/#{sep}/) - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - break - end - end - if table - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) - dataset.parse_table table - else - bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + def self.from_pubchem_aid aid + url = File.join PUBCHEM_URI, "assay/aid/#{aid}/CSV" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join PUBCHEM_URI,"assay/aid/#{aid}/description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + name = assay_metadata["name"].gsub(/\s+/,"_") + csv = CSV.parse(RestClientWrapper.get(url)) + csv.select!{|r| r[0].match /^\d/} # discard header rows + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks + sids = slice.collect{|s| s[1]} + smiles = RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT")).split("\n").collect{|s| s.to_s} + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + table << [slice[i][1].to_s,smi.chomp,slice[i][3].to_s] end end + dataset = self.new(:source => url, :name => name) + dataset.parse_table table dataset end @@ -302,8 +202,8 @@ module OpenTox features = [] # guess feature types + bioactivity = true if feature_names.size == 1 feature_names.each_with_index do |f,i| - metadata = {:name => f, :measured => true} original_id ? j = i+2 : j = i+1 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq @@ -311,11 +211,18 @@ module OpenTox if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) + if bioactivity + feature = NumericBioActivity.find_or_create_by(:name => f) + else + feature = NumericSubstanceProperty.find_or_create_by(:name => f) + end else - metadata["accept_values"] = values.sort numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) + if bioactivity + feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) + else + feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) + end end features << feature if feature end @@ -326,13 +233,12 @@ module OpenTox table.each_with_index do |vals,i| original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip - #warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? #and !accept_empty_values begin case compound_format when /SMILES/i - substance = OpenTox::Compound.from_smiles(identifier) + substance = Compound.from_smiles(identifier) when /InChI/i - substance = OpenTox::Compound.from_inchi(identifier) + substance = Compound.from_inchi(identifier) end rescue substance = nil @@ -345,18 +251,13 @@ module OpenTox substance.dataset_ids << self.id substance.dataset_ids.uniq! substance.save - - unless vals.size == features.size - warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." - next - end add substance, original_id, original_id_value if original_id vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." - next + warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + v = nil elsif numeric[j] v = v.to_f else @@ -364,6 +265,7 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end all_substances.duplicates.each do |substance| @@ -374,6 +276,115 @@ module OpenTox save end + # Serialisation + + # Convert dataset to csv format including compound smiles as first column, other column headers are feature names + # @return [String] + def to_csv(inchi=false) + CSV.generate() do |csv| + compound = substances.first.is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end + substances.each do |substance| + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + values(substance,f) ? row << values(substance,f)[i] : row << "" + end + csv << row + end + end + end + end + end + + # Convert dataset to SDF format + # @return [String] SDF string + def to_sdf + sdf = "" + substances.each do |substance| + sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = substance.smiles + sdf += sdf_lines.join("\n") + features.each do |f| + sdf += "\n> <#{f.name}>\n" + sdf += values(substance,f).uniq.join "," + end + sdf += "\n$$$$\n" + end + sdf + end + + # Dataset operations + + # Merge an array of datasets + # @param [Array] OpenTox::Dataset Array to be merged + # @param [Hash] feature modifications + # @param [Hash] value modifications + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets, feature_map=nil, value_map=nil + dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + datasets.each do |d| + d.substances.each do |s| + d.features.each do |f| + d.values(s,f).each do |v| + f = feature_map[f] if feature_map and feature_map[f] + v = value_map[v] if value_map and value_map[v] + dataset.add s,f,v #unless dataset.values(s,f).include? v + end + end + end + end + dataset.save + dataset + end + + # Split a dataset into n folds + # @param [Integer] number of folds + # @return [Array] Array with folds [training_dataset,test_dataset] + def folds n + len = self.substances.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_substances = test_idxs.collect{|i| substances[i]} + training_idxs = indices-test_idxs + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) + substances.each do |substance| + substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! + substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + end + dataset.save + dataset + end + start = last+1 + chunks << chunk + end + chunks + end + # Delete dataset def delete compounds.each{|c| c.dataset_ids.delete id.to_s} @@ -453,7 +464,7 @@ module OpenTox bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ "Accepted formats: SMILES, InChI. Please take a look on the help page." end - numeric = [] + #numeric = [] features = [] # guess feature types feature_names.each_with_index do |f,i| @@ -463,11 +474,11 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true + #numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else metadata["accept_values"] = values.sort - numeric[i] = false + #numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end features << feature if feature -- cgit v1.2.3