From faccda14c0f98333bf7623d4caef00eea7bb1933 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 19 Aug 2019 20:53:53 +0200 Subject: temporary commit before branching - Dataset.from_descriptor_csv_file is broken --- lib/dataset.rb | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 9e9fdd5..1d6b56c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -167,13 +167,15 @@ module OpenTox # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable) # @param [String] Descriptor type # @return [OpenTox::Dataset] - def self.from_descriptor_csv_file file, type + def self.from_descriptor_csv_file file, category md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) + #dataset = nil if dataset $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." + p "Parsing #{file}." table = nil sep = "," ["\t",";"].each do |s| # guess alternative CSV separator @@ -189,26 +191,17 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift) - numeric = [] - features = [] - - feature_names.each_with_index do |f,i| - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true - i == feature_names.size-1 ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) - else - numeric[i] = false - i == feature_names.size-1 ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end - features << feature if feature + bioactivity_feature_name = feature_names.pop + values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + if values.size > 5 and types.size == 1 and types.first == true # 5 max classes + bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name) + else + bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort) end - bioactivity_feature = features.pop # substances and values @@ -223,12 +216,10 @@ module OpenTox if v.blank? warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." next - elsif numeric[j] - v = v.to_f else - v = v.strip + property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category) + substance.properties[property.id.to_s] = v.to_f end - substance.properties[features[j].id.to_s] = [v] end substance.save dataset.add substance, bioactivity_feature, bioactivity_value -- cgit v1.2.3