From 9750e0309500259e9a56e267ce87984fb5bb5e53 Mon Sep 17 00:00:00 2001 From: gebele Date: Mon, 26 Nov 2018 15:29:26 +0000 Subject: clean out; better response codes; prepare for batch --- batch.rb | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 batch.rb (limited to 'batch.rb') diff --git a/batch.rb b/batch.rb new file mode 100644 index 0000000..2e72396 --- /dev/null +++ b/batch.rb @@ -0,0 +1,101 @@ +require 'csv' +require 'tempfile' + +module OpenTox + + class Batch + + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "batch" + field :name, type: String + field :source, type: String + field :identifiers, type: Array + field :ids, type: Array + field :compounds, type: Array + field :warnings, type: Array, default: [] + + def self.from_csv_file file + source = file + name = File.basename(file,".*") + batch = self.find_by(:source => source, :name => name) + if batch + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + else + $logger.debug "Parsing #{file}." + table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => []) + + # original IDs + if table[0][0] =~ /ID/i + @original_ids = table.collect{|row| row.shift} + @original_ids.shift + end + + # features + feature_names = table.shift.collect{|f| f.strip} + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + numeric = [] + features = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) + else + metadata["accept_values"] = values + numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) + end + features << feature if feature + end + + table.each_with_index do |vals,i| + identifier = vals.shift.strip.gsub(/^'|'$/,"") + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + # collect only for present compounds + unless compound.nil? + batch.identifiers << identifier + batch.compounds << compound.id + batch.ids << @original_ids[i] if @original_ids + else + batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." + end + end + batch.compounds.duplicates.each do |duplicate| + $logger.debug "Duplicates found in #{name}." + dup = Compound.find duplicate + positions = [] + batch.compounds.each_with_index do |co,i| + c = Compound.find co + if !c.blank? and c.inchi and c.inchi == dup.inchi + positions << i+1 + end + end + batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}." + end + batch.save + end + batch + end + + end + +end -- cgit v1.2.3