summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2019-08-19 17:45:27 +0200
committer Christoph Helma <helma@in-silico.ch> 2019-08-19 17:45:27 +0200
commit ede38a2f1390befe8f7cf8a62fb5432448633d63 (patch)
tree b97d26b236c8e7d7f06dbe7172f85b4ea232cb15
parent 15f093eee66597b0b4a4defa7d8645a3e13372a0 (diff)
property import from csv file
-rw-r--r-- lib/dataset.rb 78
1 files changed, 78 insertions, 0 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 7037679..9e9fdd5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -159,6 +159,84 @@ module OpenTox
end
dataset
end
+
+ # Create a dataset from CSV with descriptor values file
+ # @param [File] Input file with the following format:
+ # - ID column: header containing arbitrary string, arbitrary ID values
+ # - properties columns: header with property names, property values (i.e. independent variables)
+ # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
+ # @param [String] Descriptor type
+ # @return [OpenTox::Dataset]
+ def self.from_descriptor_csv_file file, type
+ md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+ dataset = self.find_by(:md5 => md5)
+ if dataset
+ $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
+ else
+ $logger.debug "Parsing #{file}."
+ table = nil
+ sep = ","
+ ["\t",";"].each do |s| # guess alternative CSV separator
+ if File.readlines(file).first.match(/#{s}/)
+ sep = s
+ break
+ end
+ end
+ table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+ raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator." unless table
+ dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
+
+ # features
+ feature_names = table.shift.collect{|f| f.strip}
+ raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+ original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift)
+
+ numeric = []
+ features = []
+
+ feature_names.each_with_index do |f,i|
+ values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+ types = values.collect{|v| v.numeric? ? true : false}.uniq
+ feature = nil
+ if values.size == 0 # empty feature
+ elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+ numeric[i] = true
+ i == feature_names.size-1 ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
+ else
+ numeric[i] = false
+ i == feature_names.size-1 ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
+ end
+ features << feature if feature
+ end
+ bioactivity_feature = features.pop
+
+ # substances and values
+
+ table.each_with_index do |vals,i|
+
+ original_id_value = vals.shift.to_s.strip
+ bioactivity_value = vals.pop.to_s.strip
+ substance = Substance.new
+ dataset.add substance, original_id, original_id_value
+
+ vals.each_with_index do |v,j|
+ if v.blank?
+ warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
+ next
+ elsif numeric[j]
+ v = v.to_f
+ else
+ v = v.strip
+ end
+ substance.properties[features[j].id.to_s] = [v]
+ end
+ substance.save
+ dataset.add substance, bioactivity_feature, bioactivity_value
+ end
+ dataset.save
+ end
+ dataset
+ end
# Create a dataset from SDF file
# files with a single data field are read as BioActivities (i.e. dependent variable)