# Provenance: recovered from a cgit patch (footer: "cgit v1.2.3").
#   Commit:  5a3be4190688bc8240327930b3e953b09ecc9d9e
#   Author:  gebele
#   Date:    Tue, 28 May 2019 14:25:52 +0000
#   Subject: before clean up
# In that commit batch.rb (blob 2e72396, 101 lines) was deleted.

require 'csv'
require 'tempfile'

module OpenTox

  # A batch of compounds imported from a CSV file and stored in the
  # "batch" collection.
  class Batch

    include OpenTox
    include Mongoid::Document
    include Mongoid::Timestamps
    store_in collection: "batch"
    field :name, type: String
    field :source, type: String
    # Compound identifier strings (as read from the file) of the parsed compounds
    field :identifiers, type: Array
    # Values of the optional leading ID column, one per parsed compound
    field :ids, type: Array
    # Database ids of the parsed compounds
    field :compounds, type: Array
    # Human readable messages collected during the import
    field :warnings, type: Array, default: []

    # Import a batch from a CSV file.
    #
    # Expected layout: an optional first column whose header matches /ID/i,
    # then a compound column whose header names the format (SMILES or InChI),
    # then any number of feature columns.
    #
    # If a batch with the same source and name already exists it is returned
    # unchanged and the file is not parsed again.
    #
    # @param file [String] path of the CSV file
    # @return [OpenTox::Batch] the existing or the newly saved batch
    def self.from_csv_file file
      source = file
      name = File.basename(file, ".*")
      batch = self.find_by(:source => source, :name => name)
      if batch
        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})."
        return batch
      end

      $logger.debug "Parsing #{file}."
      table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
      # An empty file has no header row to inspect; report it instead of
      # failing with NoMethodError on nil further down.
      bad_request_error "#{file} is empty." if table.empty?
      batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => [])

      # original IDs
      # Kept in a local variable: an instance variable here would belong to the
      # class object and leak the IDs of one import into the next one.
      original_ids = extract_original_ids(table)

      # features
      # to_s guards against empty header cells, which CSV returns as nil.
      feature_names = table.shift.collect { |f| f.to_s.strip }
      # The warning belongs to the new document, not to the class.
      batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.to_s
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

      register_features(table, feature_names)
      import_compounds(batch, table, compound_format, original_ids)
      add_duplicate_warnings(batch)
      batch.save
      batch
    end

    # Remove the leading ID column from every row of +table+ (in place) when
    # the first header cell matches /ID/i.
    #
    # @param table [Array<Array>] parsed CSV rows including the header row
    # @return [Array, nil] ID cells of the data rows, or nil without ID column
    def self.extract_original_ids table
      return nil unless table[0][0] =~ /ID/i
      ids = table.collect { |row| row.shift }
      ids.shift # drop the header cell of the ID column
      ids
    end

    # Guess the type of every feature column and find or create the matching
    # feature. A column with more than 5 distinct values that are all numeric
    # becomes a NumericFeature; any other non-empty column becomes a
    # NominalFeature with its distinct values as accept_values.
    #
    # @param table [Array<Array>] data rows, compound identifier in column 0
    # @param feature_names [Array<String>] headers of the feature columns
    def self.register_features table, feature_names
      feature_names.each_with_index do |feature_name, i|
        # Column 0 holds the compound identifier, features start at column 1.
        values = table.collect { |row| row[i+1].to_s.strip }.reject { |val| val.blank? }.uniq
        next if values.empty? # empty feature
        metadata = {:name => feature_name}
        if values.size > 5 && values.all? { |v| v.numeric? } # 5 max classes
          NumericFeature.find_or_create_by(metadata)
        else
          metadata["accept_values"] = values
          NominalFeature.find_or_create_by(metadata)
        end
      end
    end

    # Parse the compound of every data row and record it in +batch+.
    # Rows whose compound cannot be parsed only add a warning.
    #
    # @param batch [OpenTox::Batch] batch that receives compounds and warnings
    # @param table [Array<Array>] data rows, compound identifier in column 0
    # @param compound_format [String] header of the compound column
    # @param original_ids [Array, nil] values of the ID column, if present
    def self.import_compounds batch, table, compound_format, original_ids
      table.each_with_index do |vals, i|
        # to_s: an empty identifier cell is nil and must not abort the import.
        identifier = vals.shift.to_s.strip.gsub(/^'|'$/, "")
        compound = parse_compound(compound_format, identifier)
        # collect only for present compounds
        if compound.nil?
          # i+2: rows are 0-based and the header occupies the first line
          # (assumes no blank lines were skipped before this row).
          batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{batch.source}."
        else
          batch.identifiers << identifier
          batch.compounds << compound.id
          batch.ids << original_ids[i] if original_ids
        end
      end
    end

    # Build a compound from +identifier+ according to +compound_format+.
    #
    # @return the parsed compound, or nil when parsing raised an error
    def self.parse_compound compound_format, identifier
      case compound_format
      when /SMILES/i
        OpenTox::Compound.from_smiles(identifier)
      when /InChI/i
        OpenTox::Compound.from_inchi(identifier)
      end
    rescue
      # Deliberate best effort: the caller reports unparsable compounds.
      nil
    end

    # Add one warning per duplicated compound, listing the 1-based positions
    # (within batch.compounds) of all compounds sharing its InChI.
    #
    # @param batch [OpenTox::Batch] batch whose compounds are inspected
    def self.add_duplicate_warnings batch
      batch.compounds.duplicates.each do |duplicate|
        $logger.debug "Duplicates found in #{batch.name}."
        reference = Compound.find duplicate
        positions = []
        batch.compounds.each_with_index do |co, i|
          c = Compound.find co
          if !c.blank? && c.inchi && c.inchi == reference.inchi
            positions << i+1
          end
        end
        batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}."
      end
    end

    private_class_method :extract_original_ids, :register_features,
                         :import_compounds, :parse_compound,
                         :add_duplicate_warnings

  end

end