From 1f789133d961c29d3babfaf69cdde3d675288537 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 24 Aug 2019 14:44:52 +0200 Subject: initial refactored version for mutagenicity paper --- lib/dataset.rb | 529 ++++++++------------------------------------------------- 1 file changed, 68 insertions(+), 461 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 1d6b56c..8cb343f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,234 +1,76 @@ -require 'csv' -require 'tempfile' -require 'digest/md5' - -module OpenTox - - # Collection of substances and features - class Dataset - - field :data_entries, type: Array, default: [] #substance,feature,value - field :warnings, type: Array, default: [] - field :source, type: String - field :md5, type: String - - # Readers - - # Get all compounds - # @return [Array] - def compounds - substances.select{|s| s.is_a? Compound} - end - - # Get all nanoparticles - # @return [Array] - def nanoparticles - substances.select{|s| s.is_a? Nanoparticle} - end - - # Get all substances - # @return [Array] - def substances - @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq - @substances - end - - # Get all features - # @return [Array] - def features - @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq - @features - end - - # Get all values for a given substance and feature - # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @return [Array] values - def values substance,feature - substance = substance.id if substance.is_a? Substance - feature = feature.id if feature.is_a? Feature - substance = BSON::ObjectId.from_string(substance) if substance.is_a? String - feature = BSON::ObjectId.from_string(feature) if feature.is_a? 
String - data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} - end - - # Get OriginalId features - # @return [Array] original ID features (merged datasets may have multiple original IDs) - def original_id_features - features.select{|f| f.is_a?(OriginalId)} - end - - # Get OriginalSmiles features - # @return [Array] original smiles features (merged datasets may have multiple original smiles) - def original_smiles_features - features.select{|f| f.is_a?(OriginalSmiles)} - end - - # Get Warnings features - # @return [Array] warnings features (merged datasets may have multiple warnings) - def warnings_features - features.select{|f| f.is_a?(Warnings)} - end - - # Get Confidence feature - # @return [OpenTox::Confidence] confidence feature - def confidence_feature - features.select{|f| f.is_a?(Confidence)}.first - end - - # Get nominal and numeric bioactivity features - # @return [Array] - def bioactivity_features - features.select{|f| f._type.match(/BioActivity/)} - end - - # Get nominal and numeric bioactivity features - # @return [Array] - def transformed_bioactivity_features - features.select{|f| f._type.match(/Transformed.*BioActivity/)} - end - - # Get nominal and numeric substance property features - # @return [Array] - def substance_property_features - features.select{|f| f._type.match("SubstanceProperty")} - end - - # Get nominal and numeric prediction features - # @return [Array] - def prediction_feature - features.select{|f| f._type.match(/Prediction$/)}.first - end - - # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) - # @return [Array] - def prediction_supporting_features - features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} - end - - # Get nominal and numeric merged features - # @return [Array] - def merged_features - features.select{|f| f._type.match("Merged")} - end - - # Writers - - # Add a value for a given substance and 
feature - # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @param [TrueClass,FalseClass,Float] - def add(substance,feature,value) - substance = substance.id if substance.is_a? Substance - feature = feature.id if feature.is_a? Feature - data_entries << [substance,feature,value] if substance and feature and value - end - - # Parsers - - # Create a dataset from CSV file - # @param [File] Input file with the following format: - # - ID column (optional): header containing "ID" string, arbitrary ID values - # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings - # - one or more properties column(s): header with property name(s), property values - # files with a single property column are read as BioActivities (i.e. dependent variable) - # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) - # @return [OpenTox::Dataset] - def self.from_csv_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - table = nil - sep = "," - ["\t",";"].each do |s| # guess alternative CSV separator - if File.readlines(file).first.match(/#{s}/) - sep = s - break - end - end - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - if table - dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) - dataset.parse_table table - else - raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." 
- end - end - dataset - end - - # Create a dataset from CSV with descriptor values file - # @param [File] Input file with the following format: - # - ID column: header containing arbitrary string, arbitrary ID values - # - properties columns: header with property names, property values (i.e. independent variables) - # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable) - # @param [String] Descriptor type - # @return [OpenTox::Dataset] - def self.from_descriptor_csv_file file, category - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - #dataset = nil - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - p "Parsing #{file}." - table = nil - sep = "," - ["\t",";"].each do |s| # guess alternative CSV separator - if File.readlines(file).first.match(/#{s}/) - sep = s - break - end - end - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator." unless table - dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) - - # features - feature_names = table.shift.collect{|f| f.strip} - raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - - original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift) - - bioactivity_feature_name = feature_names.pop - values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? 
true : false}.uniq - if values.size > 5 and types.size == 1 and types.first == true # 5 max classes - bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name) - else - bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort) - end - - # substances and values +require 'matrix' + +class Dataset + + def initialize file + @dir = File.dirname file + @dependent_variable_type = File.read(File.join(@dir,"dependent_variable_type")).chomp + if @dependent_variable_type == "binary" + @dependent_variable_values = {} + File.readlines(File.join(@dir,"dependent_variable_values")).each_with_index{|v,i| @dependent_variable_values[v.chomp] = i} + end + @independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp + @lines = File.readlines(file) + @header = @lines.shift.split(",") + @header.first.match(/ID/i) ? @has_id = true : @has_id = false + @dependent_variable_name = @header.pop + @ids = [] + @dependent_variables = [] + @independent_variables = [] + @independent_variable_names = [] + end - table.each_with_index do |vals,i| + def print_variables + File.open(File.join(@dir,"ids"),"w+") { |f| f.puts @ids.join("\n") } + File.open(File.join(@dir,"dependent_variable_name"),"w+") { |f| f.puts @dependent_variable_name } + File.open(File.join(@dir,"dependent_variables"),"w+") { |f| f.puts @dependent_variables.join("\n") } + File.open(File.join(@dir,"independent_variable_names"),"w+") { |f| f.puts @independent_variable_names.join(",") } + File.open(File.join(@dir,"independent_variables"),"w+") { |f| @independent_variables.each{|row| f.puts row.join(",")} } + end - original_id_value = vals.shift.to_s.strip - bioactivity_value = vals.pop.to_s.strip - substance = Substance.new - dataset.add substance, original_id, original_id_value + def scale_independent_variables file + @header.shift if @has_id + @independent_variable_names = @header + @lines.each_with_index 
do |line,i| + items = line.chomp.split(",") + @ids << items.shift + if @dependent_variable_type == "binary" + @dependent_variables << @dependent_variable_values[items.pop] + elsif @dependent_variable_type == "numeric" + @dependent_variables << items.pop.to_f + end + @independent_variables << items.collect{|i| i.to_f} + end + @independent_variables = Matrix[ *@independent_variables ] + columns = @independent_variables.column_vectors + @independent_variable_means = columns.collect{|c| c.to_a.mean} + @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation} + scaled_columns = [] + columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@independent_variable_means[i])/@independent_variable_standard_deviations[i] : nil}} + @independent_variables = Matrix.columns(scaled_columns).to_a + print_variables + File.open(File.join(@dir,"means"),"w+") { |f| f.puts @independent_variable_means.join(",") } + File.open(File.join(@dir,"standard_deviations"),"w+") { |f| f.puts @independent_variable_standard_deviations.join(",") } + end - vals.each_with_index do |v,j| - if v.blank? - warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." - next - else - property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category) - substance.properties[property.id.to_s] = v.to_f - end - end - substance.save - dataset.add substance, bioactivity_feature, bioactivity_value - end - dataset.save + def fingerprint_independent_variables file, fingerprint_type="MP2D" + fingerprints = [] + @lines.each_with_index do |line,i| + items = line.chomp.split(",") + @has_id ? 
@ids << items.shift : @ids << i + if @dependent_variable_type == "binary" + @dependent_variables << @dependent_variable_values[items.pop] + elsif @dependent_variable_type == "numeric" + @dependent_variables << items.pop.to_f end - dataset + fingerprints << Compound.new(items[0]).fingerprint(fingerprint_type); @independent_variables << [items[0]] + fingerprints.last end + @independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq + print_variables + end +end + +=begin # Create a dataset from SDF file # files with a single data field are read as BioActivities (i.e. dependent variable) # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) @@ -325,178 +167,6 @@ module OpenTox dataset end - # Parse data in tabular format (e.g. from csv) - # does a lot of guesswork in order to determine feature types - # @param [Array] - def parse_table table - - # features - feature_names = table.shift.collect{|f| f.strip} - raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - - if feature_names[0] !~ /SMILES|InChI/i # check ID column - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) - else - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") - end - - compound_format = feature_names.shift - raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) - - numeric = [] - features = [] - - # guess feature types - bioactivity = true if feature_names.size == 1 - - feature_names.each_with_index do |f,i| - original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 - values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? 
true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true - bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) - else - numeric[i] = false - bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end - features << feature if feature - end - - # substances and values - - all_substances = [] - table.each_with_index do |vals,i| - original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip - identifier = vals.shift.strip - begin - case compound_format - when /SMILES/i - substance = Compound.from_smiles(identifier) - add substance, original_smiles, identifier - when /InChI/i - substance = Compound.from_inchi(identifier) - end - rescue - substance = nil - end - - if substance.nil? # compound parsers may return nil - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." - next - end - - all_substances << substance - add substance, original_id, original_id_value - - vals.each_with_index do |v,j| - if v.blank? - warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." - next - elsif numeric[j] - v = v.to_f - else - v = v.strip - end - add substance, features[j], v - end - end - - warnings_feature = Warnings.find_or_create_by(:dataset_id => id) - all_substances.duplicates.each do |substance| - positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? 
and c.smiles and c.smiles == substance.smiles} - all_substances.select{|s| s.smiles == substance.smiles}.each do |s| - add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." - end - end - save - end - - # Serialisation - - # Convert dataset into csv formatted training data - # @return [String] - def to_training_csv - - export_features = merged_features - export_features = transformed_bioactivity_features if export_features.empty? - export_features = bioactivity_features if export_features.empty? - export_feature = export_features.first - - header = ["Canonical SMILES"] - header << bioactivity_features.first.name # use original bioactivity name instead of long merged name - csv = [header] - - substances.each do |substance| - nr_activities = values(substance,bioactivity_features.first).size - (0..nr_activities-1).each do |n| # new row for each value - row = [substance.smiles] - row << values(substance,export_feature)[n] - csv << row - end - end - csv.collect{|r| r.join(",")}.join("\n") - end - - # Convert lazar prediction dataset to csv format - # @return [String] - def to_prediction_csv - - compound = substances.first.is_a? Compound - header = ["ID"] - header << "Original SMILES" if compound - compound ? header << "Canonical SMILES" : header << "Name" - header << "Prediction" if prediction_feature - header << "Confidence" if confidence_feature - header += prediction_supporting_features.collect{|f| f.name} - header << "Measurements" - csv = [header] - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? 
row << substance.smiles : row << substance.name - row << values(substance,prediction_feature).join(" ") - row << values(substance,confidence_feature).join(" ") - row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} - row << values(substance,bioactivity_features[0]).join(" ") - csv << row - end - csv.collect{|r| r.join(",")}.join("\n") - end - - # Export fingerprints in csv format - # @return [String] - def to_fingerprint_csv type=Compound::DEFAULT_FINGERPRINT - - fingerprints = substances.collect{|s| s.fingerprints[type]}.flatten.sort.uniq - export_features = merged_features - export_features = transformed_bioactivity_features if export_features.empty? - export_features = bioactivity_features if export_features.empty? - export_feature = export_features.first - - header = ["Canonical SMILES"] - header += fingerprints - header << bioactivity_features.first.name # use original bioactivity name instead of long merged name - csv = [header] - - substances.each do |substance| - nr_activities = values(substance,bioactivity_features.first).size - (0..nr_activities-1).each do |n| # new row for each value - row = [substance.smiles] - fingerprints.each do |f| - substance.fingerprints[type].include?(f) ? row << 1 : row << 0 - end - row << values(substance,export_feature)[n] - csv << row - end - end - csv.collect{|r| r.join(",")}.join("\n") - end # Convert dataset to SDF format # @return [String] SDF string @@ -520,70 +190,6 @@ module OpenTox sdf end - # Get lazar predictions from a dataset - # @return [Hash] predictions - def predictions - predictions = {} - substances.each do |s| - predictions[s] ||= {} - predictions[s][:value] = values(s,prediction_feature).first - #predictions[s][:warnings] = [] - #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } - predictions[s][:confidence] = values(s,confidence_feature).first - if predictions[s][:value] and prediction_feature.is_a? 
NominalLazarPrediction - prediction_feature.accept_values.each do |v| - f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) - predictions[s][:probabilities] ||= {} - predictions[s][:probabilities][v] = values(s,f).first - end - end - end - predictions - end - - # Dataset operations - - # Copy a dataset - # @return OpenTox::Dataset dataset copy - def copy - dataset = Dataset.new - dataset.data_entries = data_entries - dataset.warnings = warnings - dataset.name = name - dataset.source = id.to_s - dataset.save - dataset - end - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - $logger.debug "Creating #{n} folds for #{name}." - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i].id} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i].id} - chunk = [training_substances,test_substances].collect do |substances| - self.class.create( - :name => "#{self.name} (Fold #{i-1})", - :source => self.id, - :data_entries => data_entries.select{|row| substances.include? row[0]} - ) - end - start = last+1 - chunks << chunk - end - chunks - end # Merge an array of datasets # @param [Array] datasets Datasets to be merged @@ -634,3 +240,4 @@ module OpenTox end end +=end -- cgit v1.2.3