# Provenance: this method was added to lib/dataset.rb (inside module OpenTox) by
# commit ede38a2f1390befe8f7cf8a62fb5432448633d63, "property import from csv file".

# Create a dataset from a CSV file with descriptor values.
#
# Expected file layout:
# - ID column (first column): header containing an arbitrary string, arbitrary ID values
# - property columns: header with property names, property values (i.e. independent variables)
# - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
#
# The column separator is guessed from the header line: TAB or ';' if present,
# otherwise ','. A file whose MD5 digest matches an already stored dataset is not
# parsed again; the stored dataset is returned instead.
#
# A column becomes a numeric feature when it has more than 5 distinct non-blank
# values and all of them are numeric; otherwise it becomes a nominal feature whose
# accepted values are the sorted distinct values. Columns without any value get no
# feature.
#
# @param [File] file Input file in the format described above
# @param [String] type Descriptor type (accepted for interface compatibility; not used by this method)
# @return [OpenTox::Dataset]
# @raise [ArgumentError] if the file contains no rows, the header contains
#   duplicated names, or the bioactivity column holds no values
def self.from_descriptor_csv_file file, type
  md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
  dataset = self.find_by(:md5 => md5)
  if dataset
    $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
    return dataset
  end

  $logger.debug "Parsing #{file}."
  # Read the header line once (nil.to_s => "" for an empty file) and guess an
  # alternative CSV separator from it.
  header_line = File.open(file) { |f| f.gets }.to_s
  sep = ["\t", ";"].find { |s| header_line.include?(s) } || ","
  table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
  # CSV.read returns an empty array (not nil) for a file without rows.
  if table.nil? || table.empty?
    raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator."
  end
  dataset = self.new(:source => file, :name => File.basename(file, ".*"), :md5 => md5)

  # features
  feature_names = table.shift.map { |f| f.to_s.strip } # to_s: empty header cells are nil
  raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
  original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id, :name => feature_names.shift)

  # Both arrays are indexed by column position (ID column excluded), so that
  # features[j] always belongs to feature_names[j]; nil marks an empty column.
  numeric = []
  features = Array.new(feature_names.size)
  bioactivity_index = feature_names.size - 1

  feature_names.each_with_index do |name, i|
    values = table.map { |row| val = row[i + 1].to_s.strip; val.blank? ? nil : val }.uniq.compact
    next if values.empty? # empty feature

    is_bioactivity = (i == bioactivity_index)
    # numeric? is called on the String cell values; it is not defined in this block.
    if values.size > 5 && values.all? { |v| v.numeric? } # 5 max classes
      numeric[i] = true
      feature_class = is_bioactivity ? NumericBioActivity : NumericSubstanceProperty
      features[i] = feature_class.find_or_create_by(:name => name)
    else
      numeric[i] = false
      feature_class = is_bioactivity ? NominalBioActivity : NominalSubstanceProperty
      features[i] = feature_class.find_or_create_by(:name => name, :accept_values => values.sort)
    end
  end
  bioactivity_feature = features.pop
  unless bioactivity_feature
    raise ArgumentError, "Bioactivity column (last column) of #{file} does not contain any values."
  end

  # substances and values
  table.each do |vals|
    original_id_value = vals.shift.to_s.strip
    # NOTE(review): the bioactivity value is stored as a String even when the
    # bioactivity feature is numeric - confirm whether a Float is expected.
    bioactivity_value = vals.pop.to_s.strip
    substance = Substance.new
    dataset.add substance, original_id, original_id_value

    vals.each_with_index do |v, j|
      if v.blank?
        # NOTE(review): reported through the logger; if Dataset keeps its own
        # warnings collection, recording the message there may be preferable.
        $logger.warn "Empty value for compound '#{original_id_value}' and feature '#{feature_names[j]}'."
        next
      end
      substance.properties[features[j].id.to_s] = [numeric[j] ? v.to_f : v.strip]
    end
    substance.save
    dataset.add substance, bioactivity_feature, bioactivity_value
  end
  dataset.save
  dataset
end