property import from csv file

author: Christoph Helma <helma@in-silico.ch> 2019-08-19 17:45:27 +0200
committer: Christoph Helma <helma@in-silico.ch> 2019-08-19 17:45:27 +0200
commit: ede38a2f1390befe8f7cf8a62fb5432448633d63 (patch)
tree: b97d26b236c8e7d7f06dbe7172f85b4ea232cb15
parent: 15f093eee66597b0b4a4defa7d8645a3e13372a0 (diff)
1 files changed, 78 insertions, 0 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 7037679..9e9fdd5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -159,6 +159,84 @@ module OpenTox
       end
       dataset
     end
+    
+    # Create a dataset from CSV with descriptor values file
+    # @param [File] Input file with the following format:
+    #   - ID column: header containing arbitrary string, arbitrary ID values
+    #   - properties columns: header with property names, property values (i.e. independent variables)
+    #   - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
+    # @param [String] Descriptor type
+    # @return [OpenTox::Dataset]
+    def self.from_descriptor_csv_file file, type
+      md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+      dataset = self.find_by(:md5 => md5)
+      if dataset
+        $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
+      else
+        $logger.debug "Parsing #{file}."
+        table = nil
+        sep = ","
+        ["\t",";"].each do |s| # guess alternative CSV separator
+          if File.readlines(file).first.match(/#{s}/)
+            sep = s
+            break
+          end
+        end
+        table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+        raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator." unless table
+        dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
+
+        # features
+        feature_names = table.shift.collect{|f| f.strip}
+        raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+        original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift)
+
+        numeric = []
+        features = []
+
+        feature_names.each_with_index do |f,i|
+          values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+          types = values.collect{|v| v.numeric? ? true : false}.uniq
+          feature = nil
+          if values.size == 0 # empty feature
+          elsif  values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+            numeric[i] = true
+            i == feature_names.size-1 ?  feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
+          else
+            numeric[i] = false
+            i == feature_names.size-1 ?  feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
+          end
+          features << feature if feature
+        end
+        bioactivity_feature = features.pop
+        
+        # substances and values
+
+        table.each_with_index do |vals,i|
+
+          original_id_value = vals.shift.to_s.strip
+          bioactivity_value = vals.pop.to_s.strip
+          substance = Substance.new
+          dataset.add substance, original_id, original_id_value 
+
+          vals.each_with_index do |v,j|
+            if v.blank?
+              warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
+              next
+            elsif numeric[j]
+              v = v.to_f
+            else
+              v = v.strip
+            end
+            substance.properties[features[j].id.to_s] = [v]
+          end
+          substance.save
+          dataset.add substance, bioactivity_feature, bioactivity_value
+        end
+        dataset.save
+      end
+      dataset
+    end
 
     # Create a dataset from SDF file 
     #   files with a single data field are read as BioActivities (i.e. dependent variable)
author	Christoph Helma <helma@in-silico.ch>	2019-08-19 17:45:27 +0200
committer	Christoph Helma <helma@in-silico.ch>	2019-08-19 17:45:27 +0200
commit	ede38a2f1390befe8f7cf8a62fb5432448633d63 (patch)
tree	b97d26b236c8e7d7f06dbe7172f85b4ea232cb15
parent	15f093eee66597b0b4a4defa7d8645a3e13372a0 (diff)