summary refs log tree commit diff
path: root/lib/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dataset.rb')
-rw-r--r--  lib/dataset.rb  529
1 file changed, 68 insertions, 461 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 1d6b56c..8cb343f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -1,234 +1,76 @@
-require 'csv'
-require 'tempfile'
-require 'digest/md5'
-
-module OpenTox
-
- # Collection of substances and features
- class Dataset
-
- field :data_entries, type: Array, default: [] #substance,feature,value
- field :warnings, type: Array, default: []
- field :source, type: String
- field :md5, type: String
-
- # Readers
-
- # Get all compounds
- # @return [Array<OpenTox::Compound>]
- def compounds
- substances.select{|s| s.is_a? Compound}
- end
-
- # Get all nanoparticles
- # @return [Array<OpenTox::Nanoparticle>]
- def nanoparticles
- substances.select{|s| s.is_a? Nanoparticle}
- end
-
- # Get all substances
- # @return [Array<OpenTox::Substance>]
- def substances
- @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq
- @substances
- end
-
- # Get all features
- # @return [Array<OpenTox::Feature>]
- def features
- @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq
- @features
- end
-
- # Get all values for a given substance and feature
- # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id
- # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id
- # @return [Array<TrueClass,FalseClass,Float>] values
- def values substance,feature
- substance = substance.id if substance.is_a? Substance
- feature = feature.id if feature.is_a? Feature
- substance = BSON::ObjectId.from_string(substance) if substance.is_a? String
- feature = BSON::ObjectId.from_string(feature) if feature.is_a? String
- data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]}
- end
-
- # Get OriginalId features
- # @return [Array<OpenTox::OriginalId>] original ID features (merged datasets may have multiple original IDs)
- def original_id_features
- features.select{|f| f.is_a?(OriginalId)}
- end
-
- # Get OriginalSmiles features
- # @return [Array<OpenTox::OriginalSmiles>] original smiles features (merged datasets may have multiple original smiles)
- def original_smiles_features
- features.select{|f| f.is_a?(OriginalSmiles)}
- end
-
- # Get Warnings features
- # @return [Array<OpenTox::Warnings>] warnings features (merged datasets may have multiple warnings)
- def warnings_features
- features.select{|f| f.is_a?(Warnings)}
- end
-
- # Get Confidence feature
- # @return [OpenTox::Confidence] confidence feature
- def confidence_feature
- features.select{|f| f.is_a?(Confidence)}.first
- end
-
- # Get nominal and numeric bioactivity features
- # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>]
- def bioactivity_features
- features.select{|f| f._type.match(/BioActivity/)}
- end
-
- # Get nominal and numeric bioactivity features
- # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>]
- def transformed_bioactivity_features
- features.select{|f| f._type.match(/Transformed.*BioActivity/)}
- end
-
- # Get nominal and numeric substance property features
- # @return [Array<OpenTox::NominalSubstanceProperty,OpenTox::NumericSubstanceProperty>]
- def substance_property_features
- features.select{|f| f._type.match("SubstanceProperty")}
- end
-
- # Get nominal and numeric prediction features
- # @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>]
- def prediction_feature
- features.select{|f| f._type.match(/Prediction$/)}.first
- end
-
- # Get supporting nominal and numeric prediction features (class probabilities, prediction interval)
- # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>]
- def prediction_supporting_features
- features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)}
- end
-
- # Get nominal and numeric merged features
- # @return [Array<OpenTox::MergedNominalBioActivity,OpenTox::MergedNumericBioActivity>]
- def merged_features
- features.select{|f| f._type.match("Merged")}
- end
-
- # Writers
-
- # Add a value for a given substance and feature
- # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id
- # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id
- # @param [TrueClass,FalseClass,Float]
- def add(substance,feature,value)
- substance = substance.id if substance.is_a? Substance
- feature = feature.id if feature.is_a? Feature
- data_entries << [substance,feature,value] if substance and feature and value
- end
-
- # Parsers
-
- # Create a dataset from CSV file
- # @param [File] Input file with the following format:
- # - ID column (optional): header containing "ID" string, arbitrary ID values
- # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings
- # - one or more properties column(s): header with property name(s), property values
- # files with a single property column are read as BioActivities (i.e. dependent variable)
- # files with multiple property columns are read as SubstanceProperties (i.e. independent variables)
- # @return [OpenTox::Dataset]
- def self.from_csv_file file
- md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
- dataset = self.find_by(:md5 => md5)
- if dataset
- $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
- else
- $logger.debug "Parsing #{file}."
- table = nil
- sep = ","
- ["\t",";"].each do |s| # guess alternative CSV separator
- if File.readlines(file).first.match(/#{s}/)
- sep = s
- break
- end
- end
- table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
- if table
- dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
- dataset.parse_table table
- else
- raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator."
- end
- end
- dataset
- end
-
- # Create a dataset from CSV with descriptor values file
- # @param [File] Input file with the following format:
- # - ID column: header containing arbitrary string, arbitrary ID values
- # - properties columns: header with property names, property values (i.e. independent variables)
- # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
- # @param [String] Descriptor type
- # @return [OpenTox::Dataset]
- def self.from_descriptor_csv_file file, category
- md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
- dataset = self.find_by(:md5 => md5)
- #dataset = nil
- if dataset
- $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
- else
- $logger.debug "Parsing #{file}."
- p "Parsing #{file}."
- table = nil
- sep = ","
- ["\t",";"].each do |s| # guess alternative CSV separator
- if File.readlines(file).first.match(/#{s}/)
- sep = s
- break
- end
- end
- table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
- raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator." unless table
- dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
-
- # features
- feature_names = table.shift.collect{|f| f.strip}
- raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
-
- original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift)
-
- bioactivity_feature_name = feature_names.pop
- values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact
- types = values.collect{|v| v.numeric? ? true : false}.uniq
- if values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name)
- else
- bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort)
- end
-
- # substances and values
+require 'matrix'
+
+class Dataset
+
+ def initialize file
+ @dir = File.dirname file
+ @dependent_variable_type = File.read(File.join(@dir,"dependent_variable_type")).chomp
+ if @dependent_variable_type == "binary"
+ @dependent_variable_values = {}
+ File.readlines(File.join(@dir,"dependent_variable_values")).each_with_index{|v,i| @dependent_variable_values[v.chomp] = i}
+ end
+ @independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp
+ @lines = File.readlines(file)
+ @header = @lines.shift.split(",")
+ @header.first.match(/ID/i) ? @has_id = true : @has_id = false
+ @dependent_variable_name = @header.pop
+ @ids = []
+ @dependent_variables = []
+ @independent_variables = []
+ @independent_variable_names = []
+ end
- table.each_with_index do |vals,i|
+ def print_variables
+ File.open(File.join(@dir,"ids"),"w+") { |f| f.puts @ids.join("\n") }
+ File.open(File.join(@dir,"dependent_variable_name"),"w+") { |f| f.puts @dependent_variable_name }
+ File.open(File.join(@dir,"dependent_variables"),"w+") { |f| f.puts @dependent_variables.join("\n") }
+ File.open(File.join(@dir,"independent_variable_names"),"w+") { |f| f.puts @independent_variable_names.join(",") }
+ File.open(File.join(@dir,"independent_variables"),"w+") { |f| @independent_variables.each{|row| f.puts row.join(",")} }
+ end
- original_id_value = vals.shift.to_s.strip
- bioactivity_value = vals.pop.to_s.strip
- substance = Substance.new
- dataset.add substance, original_id, original_id_value
+ def scale_independent_variables file
+ @header.shift if @has_id
+ @independent_variable_names = @header
+ @lines.each_with_index do |line,i|
+ items = line.chomp.split(",")
+ @ids << items.shift
+ if @dependent_variable_type == "binary"
+ @dependent_variables << @dependent_variable_values[items.pop]
+ elsif @dependent_variable_type == "numeric"
+ @dependent_variables << items.pop.to_f
+ end
+ @independent_variables << items.collect{|i| i.to_f}
+ end
+ @independent_variables = Matrix[ *@independent_variables ]
+ columns = @independent_variables.column_vectors
+ @independent_variable_means = columns.collect{|c| c.to_a.mean}
+ @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
+ scaled_columns = []
+ columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@independent_variable_means[i])/@independent_variable_standard_deviations[i] : nil}}
+ @independent_variables = Matrix.columns(scaled_columns).to_a
+ print_variables
+ File.open(File.join(@dir,"means"),"w+") { |f| f.puts @independent_variable_means.join(",") }
+ File.open(File.join(@dir,"standard_deviations"),"w+") { |f| f.puts @independent_variable_standard_deviations.join(",") }
+ end
- vals.each_with_index do |v,j|
- if v.blank?
- warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
- next
- else
- property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category)
- substance.properties[property.id.to_s] = v.to_f
- end
- end
- substance.save
- dataset.add substance, bioactivity_feature, bioactivity_value
- end
- dataset.save
+ def fingerprint_independent_variables file, fingerprint_type="MP2D"
+ fingerprints = []
+ @lines.each_with_index do |line,i|
+ items = line.chomp.split(",")
+ @has_id ? @ids << items.shift : @ids << i
+ if @dependent_variable_type == "binary"
+ @dependent_variables << @dependent_variable_values[items.pop]
+ elsif @dependent_variable_type == "numeric"
+ @dependent_variables << items.pop.to_f
end
- dataset
+ @independent_variables << [items[0]] + Compound.new(items[0]).fingerprint(fingerprint_type)
end
+ @independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq
+ print_variables
+ end
+end
+=begin
# Create a dataset from SDF file
# files with a single data field are read as BioActivities (i.e. dependent variable)
# files with multiple data fields are read as SubstanceProperties (i.e. independent variable)
@@ -325,178 +167,6 @@ module OpenTox
dataset
end
- # Parse data in tabular format (e.g. from csv)
- # does a lot of guesswork in order to determine feature types
- # @param [Array<Array>]
- def parse_table table
-
- # features
- feature_names = table.shift.collect{|f| f.strip}
- raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
-
- if feature_names[0] !~ /SMILES|InChI/i # check ID column
- original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
- else
- original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
- end
-
- compound_format = feature_names.shift
- raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
- original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i)
-
- numeric = []
- features = []
-
- # guess feature types
- bioactivity = true if feature_names.size == 1
-
- feature_names.each_with_index do |f,i|
- original_id.name.match(/LineID$/) ? j = i+1 : j = i+2
- values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact
- types = values.collect{|v| v.numeric? ? true : false}.uniq
- feature = nil
- if values.size == 0 # empty feature
- elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- numeric[i] = true
- bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
- else
- numeric[i] = false
- bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
- end
- features << feature if feature
- end
-
- # substances and values
-
- all_substances = []
- table.each_with_index do |vals,i|
- original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip
- identifier = vals.shift.strip
- begin
- case compound_format
- when /SMILES/i
- substance = Compound.from_smiles(identifier)
- add substance, original_smiles, identifier
- when /InChI/i
- substance = Compound.from_inchi(identifier)
- end
- rescue
- substance = nil
- end
-
- if substance.nil? # compound parsers may return nil
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
- next
- end
-
- all_substances << substance
- add substance, original_id, original_id_value
-
- vals.each_with_index do |v,j|
- if v.blank?
- warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
- next
- elsif numeric[j]
- v = v.to_f
- else
- v = v.strip
- end
- add substance, features[j], v
- end
- end
-
- warnings_feature = Warnings.find_or_create_by(:dataset_id => id)
- all_substances.duplicates.each do |substance|
- positions = []
- all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles}
- all_substances.select{|s| s.smiles == substance.smiles}.each do |s|
- add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
- end
- end
- save
- end
-
- # Serialisation
-
- # Convert dataset into csv formatted training data
- # @return [String]
- def to_training_csv
-
- export_features = merged_features
- export_features = transformed_bioactivity_features if export_features.empty?
- export_features = bioactivity_features if export_features.empty?
- export_feature = export_features.first
-
- header = ["Canonical SMILES"]
- header << bioactivity_features.first.name # use original bioactivity name instead of long merged name
- csv = [header]
-
- substances.each do |substance|
- nr_activities = values(substance,bioactivity_features.first).size
- (0..nr_activities-1).each do |n| # new row for each value
- row = [substance.smiles]
- row << values(substance,export_feature)[n]
- csv << row
- end
- end
- csv.collect{|r| r.join(",")}.join("\n")
- end
-
- # Convert lazar prediction dataset to csv format
- # @return [String]
- def to_prediction_csv
-
- compound = substances.first.is_a? Compound
- header = ["ID"]
- header << "Original SMILES" if compound
- compound ? header << "Canonical SMILES" : header << "Name"
- header << "Prediction" if prediction_feature
- header << "Confidence" if confidence_feature
- header += prediction_supporting_features.collect{|f| f.name}
- header << "Measurements"
- csv = [header]
-
- substances.each do |substance|
- row = original_id_features.collect{|f| values(substance,f).join(" ")}
- row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
- compound ? row << substance.smiles : row << substance.name
- row << values(substance,prediction_feature).join(" ")
- row << values(substance,confidence_feature).join(" ")
- row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
- row << values(substance,bioactivity_features[0]).join(" ")
- csv << row
- end
- csv.collect{|r| r.join(",")}.join("\n")
- end
-
- # Export fingerprints in csv format
- # @return [String]
- def to_fingerprint_csv type=Compound::DEFAULT_FINGERPRINT
-
- fingerprints = substances.collect{|s| s.fingerprints[type]}.flatten.sort.uniq
- export_features = merged_features
- export_features = transformed_bioactivity_features if export_features.empty?
- export_features = bioactivity_features if export_features.empty?
- export_feature = export_features.first
-
- header = ["Canonical SMILES"]
- header += fingerprints
- header << bioactivity_features.first.name # use original bioactivity name instead of long merged name
- csv = [header]
-
- substances.each do |substance|
- nr_activities = values(substance,bioactivity_features.first).size
- (0..nr_activities-1).each do |n| # new row for each value
- row = [substance.smiles]
- fingerprints.each do |f|
- substance.fingerprints[type].include?(f) ? row << 1 : row << 0
- end
- row << values(substance,export_feature)[n]
- csv << row
- end
- end
- csv.collect{|r| r.join(",")}.join("\n")
- end
# Convert dataset to SDF format
# @return [String] SDF string
@@ -520,70 +190,6 @@ module OpenTox
sdf
end
- # Get lazar predictions from a dataset
- # @return [Hash] predictions
- def predictions
- predictions = {}
- substances.each do |s|
- predictions[s] ||= {}
- predictions[s][:value] = values(s,prediction_feature).first
- #predictions[s][:warnings] = []
- #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
- predictions[s][:confidence] = values(s,confidence_feature).first
- if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction
- prediction_feature.accept_values.each do |v|
- f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id)
- predictions[s][:probabilities] ||= {}
- predictions[s][:probabilities][v] = values(s,f).first
- end
- end
- end
- predictions
- end
-
- # Dataset operations
-
- # Copy a dataset
- # @return OpenTox::Dataset dataset copy
- def copy
- dataset = Dataset.new
- dataset.data_entries = data_entries
- dataset.warnings = warnings
- dataset.name = name
- dataset.source = id.to_s
- dataset.save
- dataset
- end
-
- # Split a dataset into n folds
- # @param [Integer] number of folds
- # @return [Array] Array with folds [training_dataset,test_dataset]
- def folds n
- $logger.debug "Creating #{n} folds for #{name}."
- len = self.substances.size
- indices = (0..len-1).to_a.shuffle
- mid = (len/n)
- chunks = []
- start = 0
- 1.upto(n) do |i|
- last = start+mid
- last = last-1 unless len%n >= i
- test_idxs = indices[start..last] || []
- test_substances = test_idxs.collect{|i| substances[i].id}
- training_idxs = indices-test_idxs
- training_substances = training_idxs.collect{|i| substances[i].id}
- chunk = [training_substances,test_substances].collect do |substances|
- self.class.create(
- :name => "#{self.name} (Fold #{i-1})",
- :source => self.id,
- :data_entries => data_entries.select{|row| substances.include? row[0]}
- )
- end
- start = last+1
- chunks << chunk
- end
- chunks
- end
# Merge an array of datasets
# @param [Array<OpenTox::Dataset>] datasets Datasets to be merged
@@ -634,3 +240,4 @@ module OpenTox
end
end
+=end