summaryrefslogtreecommitdiff
path: root/batch.rb
blob: 2e7239617ed5fb024f0260048e87d47414bbe54c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
require 'csv'
require 'tempfile'

module OpenTox

  # A batch of compounds imported from a CSV file, persisted as a Mongoid
  # document in the "batch" collection.
  class Batch

    include OpenTox
    include Mongoid::Document
    include Mongoid::Timestamps
    store_in collection: "batch"
    # Basename of the imported file, without its extension.
    field :name,  type: String
    # The file argument given to from_csv_file.
    field :source,  type: String
    # Compound identifiers (first data column) of the rows that could be parsed.
    field :identifiers, type: Array
    # Values of the optional leading ID column, for the rows that could be parsed.
    field :ids, type: Array
    # Ids of the Compound objects created from the identifiers.
    field :compounds, type: Array
    # Messages collected while importing (duplicated header, unparsable or duplicate compounds).
    field :warnings, type: Array, default: []

    # Import a CSV file as a Batch, or return the Batch already stored for the
    # same source/name pair.
    #
    # Expected layout: an optional first column whose header matches /ID/i,
    # then a compound column whose header matches SMILES or InChI
    # (case-insensitive), then any number of feature columns.
    #
    # @param file [String] path of the CSV file to import
    # @return [Batch] the existing or the newly saved batch
    # @note Calls bad_request_error (defined outside this file; presumably it
    #   raises - TODO confirm) for an empty file or an unsupported compound
    #   format.
    def self.from_csv_file file
      source = file
      name = File.basename(file,".*")
      # NOTE(review): the nil check below only works if find_by returns nil for
      # a missing document, i.e. Mongoid is configured not to raise - TODO confirm.
      batch = self.find_by(:source => source, :name => name)
      if batch
        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})."
        return batch
      end

      $logger.debug "Parsing #{file}."
      # Input is read as windows-1251 and transcoded to UTF-8; blank lines are dropped.
      table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
      # Without this guard an empty file crashes on table[0][0] below.
      bad_request_error "#{file} is empty." if table.empty?
      batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => [])

      # original IDs
      # Kept in a local: an @ivar here would live on the class object and leak
      # the IDs of one import into the next import that has no ID column.
      original_ids = nil
      if table[0][0] =~ /ID/i
        original_ids = table.map { |row| row.shift }
        original_ids.shift # drop the header cell so indices line up with the data rows
      end

      # features
      # to_s guards against empty (nil) header cells.
      feature_names = table.shift.map { |f| f.to_s.strip }
      # The warning belongs to the batch being built; a bare `warnings` is
      # undefined in this class-method scope.
      batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.to_s
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

      # guess feature types
      feature_names.each_with_index do |f,i|
        metadata = {:name => f}
        # Rows still start with the compound identifier, hence the i+1 offset.
        values = table.map { |row| val = row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
        next if values.empty? # empty feature
        # More than 5 distinct values, all numeric => numeric feature;
        # anything else is treated as nominal (5 max classes for numeric-looking data).
        if values.size > 5 && values.all? { |v| v.numeric? }
          NumericFeature.find_or_create_by(metadata)
        else
          metadata["accept_values"] = values
          NominalFeature.find_or_create_by(metadata)
        end
      end

      table.each_with_index do |vals,i|
        # to_s guards against an empty identifier cell; surrounding single quotes are stripped.
        identifier = vals.shift.to_s.strip.gsub(/^'|'$/,"")
        # Best effort: any parser error is turned into a warning below.
        compound = begin
          case compound_format
          when /SMILES/i
            OpenTox::Compound.from_smiles(identifier)
          when /InChI/i
            OpenTox::Compound.from_inchi(identifier)
          end
        rescue
          nil
        end
        if compound.nil?
          # i+2: one for the header row, one for 1-based numbering.
          batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}."
        else
          # collect only for present compounds
          batch.identifiers << identifier
          batch.compounds << compound.id
          batch.ids << original_ids[i] if original_ids
        end
      end

      # Report the 1-based positions (within the parsed compounds) that share
      # an InChI with each duplicated compound.
      inchis = nil
      batch.compounds.duplicates.each do |duplicate|
        $logger.debug "Duplicates found in #{name}."
        # Look every compound up once, and only if there is at least one duplicate.
        inchis ||= batch.compounds.map do |co|
          c = Compound.find co
          c.blank? ? nil : c.inchi
        end
        duplicate_inchi = Compound.find(duplicate).inchi
        positions = []
        inchis.each_with_index do |inchi,i|
          positions << i+1 if inchi && inchi == duplicate_inchi
        end
        batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}."
      end
      batch.save
      batch
    end

  end

end