diff options
author | gebele <gebele@in-silico.ch> | 2018-04-20 10:45:11 +0000 |
---|---|---|
committer | gebele <gebele@in-silico.ch> | 2018-04-20 10:45:11 +0000 |
commit | 19eb655f4af1a4631692989a30a59b7b78e6669b (patch) | |
tree | 9bcfe5f51f40fa02d0e805b7a61648f9937406c0 /batch.rb | |
parent | f78d888a249303379c7934c83d5da11349a44b48 (diff) |
batch download with original identifiers
Diffstat (limited to 'batch.rb')
-rw-r--r-- | batch.rb | 81 |
1 files changed, 81 insertions, 0 deletions
require 'csv'
require 'tempfile'

module OpenTox

  # A batch of compounds imported from a CSV file and stored in the "batch"
  # collection. The identifiers are kept exactly as they were written in the
  # file (after stripping whitespace), next to the result of parsing each one.
  class Batch

    include OpenTox
    include Mongoid::Document
    include Mongoid::Timestamps
    store_in collection: "batch"
    # Basename of the imported file, without extension.
    field :name, type: String
    # The file argument given to from_csv_file.
    field :source, type: String
    # First-column value of every data row, in file order.
    field :identifiers, type: Array
    # One entry per data row, parallel to identifiers: the id of the parsed
    # compound, or a "Cannot parse ..." message when parsing failed.
    field :compounds, type: Array
    field :warnings, type: Array, default: []

    # Import a CSV file as a batch, or return the batch already stored for the
    # same source and name.
    #
    # The first row is the header. Its first cell names the compound format and
    # must match SMILES or InChI (case-insensitive); the remaining header cells
    # are feature names. Every following row contributes its first cell as an
    # identifier.
    #
    # NOTE(review): the lookup relies on find_by returning nil when nothing
    # matches; presumably Mongoid is configured with
    # raise_not_found_error: false -- confirm.
    #
    # @param file [String] path of the CSV file
    # @return [OpenTox::Batch] the stored or newly saved batch
    def self.from_csv_file file
      source = file
      name = File.basename(file, ".*")
      batch = self.find_by(:source => source, :name => name)
      if batch
        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})."
        return batch
      end

      $logger.debug "Parsing #{file}."
      table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
      batch = self.new(:source => source, :name => name, :identifiers => [], :compounds => [])

      # An empty file has no header row and header cells may be nil; treat both
      # as empty strings so the format check below reports the problem.
      feature_names = (table.shift || []).collect { |f| f.to_s.strip }
      # The warning belongs to the batch being built; there is no class-level
      # warnings list.
      batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.to_s
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

      create_features feature_names, table

      table.each_with_index do |vals, i|
        # A row may start with an empty cell (nil); record it as "" instead of
        # failing on nil.
        identifier = vals.shift.to_s.strip
        batch.identifiers << identifier
        compound = parse_compound(compound_format, identifier)
        if compound.nil? # compound parsers may return nil
          # i + 2: one for the header row, one for 1-based numbering. Blank
          # rows skipped by the CSV reader are not counted.
          batch.compounds << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}."
          next
        end
        batch.compounds << compound.id
      end
      batch.save
      batch
    end

    # Guess the type of every feature column and find or create the matching
    # feature. A column with more than 5 distinct non-blank values that are all
    # numeric becomes a NumericFeature; any other non-empty column becomes a
    # NominalFeature with its distinct values as accept_values. Columns without
    # values are skipped.
    def self.create_features feature_names, table
      feature_names.each_with_index do |feature_name, i|
        metadata = {:name => feature_name}
        # Column i of the features is column i + 1 of the row, because the
        # first cell holds the compound identifier.
        values = table.collect { |row| row[i+1].to_s.strip }.reject { |val| val.blank? }.uniq
        next if values.empty? # empty feature
        if values.size > 5 && values.all? { |val| val.numeric? } # 5 max classes
          NumericFeature.find_or_create_by(metadata)
        else
          metadata["accept_values"] = values
          NominalFeature.find_or_create_by(metadata)
        end
      end
    end
    private_class_method :create_features

    # Parse an identifier with the parser selected by the header's format cell.
    # Returns nil when the parser returns nil or raises.
    def self.parse_compound compound_format, identifier
      case compound_format
      when /SMILES/i
        OpenTox::Compound.from_smiles(identifier)
      when /InChI/i
        OpenTox::Compound.from_inchi(identifier)
      end
    rescue
      nil
    end
    private_class_method :parse_compound

  end

end