1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
require 'csv'
require 'tempfile'
module OpenTox
# A set of compounds imported from a CSV file, persisted in the "batch"
# collection.
class Batch
  include OpenTox
  include Mongoid::Document
  include Mongoid::Timestamps
  store_in collection: "batch"

  # The field notes below describe how from_csv_file populates each field.
  field :name, type: String        # basename of the imported file, extension removed
  field :source, type: String      # the file argument exactly as passed in
  field :identifiers, type: Array  # identifier strings of the successfully parsed compounds
  field :ids, type: Array          # values of the optional leading ID column (parsed compounds only)
  field :compounds, type: Array    # ids of the parsed compound documents
  field :warnings, type: Array, default: []

  # Import a CSV file as a batch, or return the stored batch if one with the
  # same source and name already exists.
  #
  # Expected layout: a header row whose first cell names the compound format
  # (SMILES or InChI), optionally preceded by a column whose header matches
  # /ID/i; every following column is treated as a feature.
  #
  # Rows whose compound cannot be parsed are left out of the batch and
  # reported in batch.warnings, as are duplicated header names and duplicated
  # compounds.
  #
  # @param file [String] path of the CSV file
  # @return [Batch] the existing or newly saved batch
  # @note bad_request_error is called when the file has no rows or when the
  #   compound column is neither SMILES nor InChI (presumably it raises; it is
  #   defined outside this file).
  def self.from_csv_file(file)
    source = file
    name = File.basename(file, ".*")
    batch = find_by(:source => source, :name => name)
    if batch
      $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})."
      return batch
    end

    $logger.debug "Parsing #{file}."
    table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
    bad_request_error "#{file} is empty." if table.empty?
    batch = new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => [])

    # original IDs
    # Kept in a local: an instance variable here would belong to the class
    # object and leak the ID column of one import into the next one.
    original_ids = nil
    if table.first.first =~ /ID/i
      original_ids = table.collect { |row| row.shift }
      original_ids.shift # drop the header cell of the ID column
    end

    # features
    feature_names = table.shift.collect { |f| f.strip }
    batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
    compound_format = feature_names.shift
    bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
    create_features(feature_names, table)

    table.each_with_index do |row, i|
      identifier = row.first.strip.gsub(/^'|'$/, "")
      compound = compound_from(compound_format, identifier)
      # collect only for present compounds
      if compound.nil?
        # i + 2: one for the header row, one for 1-based line numbering
        batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}."
      else
        batch.identifiers << identifier
        batch.compounds << compound.id
        batch.ids << original_ids[i] if original_ids
      end
    end

    warn_about_duplicates(batch, name)
    batch.save
    batch
  end

  # Guess the type of every feature column and find or create the matching
  # feature document. Column i of feature_names lives at row[i + 1] because
  # row[0] is the compound identifier.
  def self.create_features(feature_names, table)
    feature_names.each_with_index do |feature_name, i|
      values = table.collect { |row| row[i + 1].to_s.strip }.reject { |v| v.blank? }.uniq
      next if values.empty? # empty column: no feature is created

      metadata = {:name => feature_name}
      # Columns with at most 5 distinct values are treated as nominal classes,
      # even when every value is numeric.
      if values.size > 5 && values.all? { |v| v.numeric? }
        NumericFeature.find_or_create_by(metadata)
      else
        metadata["accept_values"] = values
        NominalFeature.find_or_create_by(metadata)
      end
    end
  end
  private_class_method :create_features

  # Build a compound from its identifier according to the column format.
  # Returns nil when the parser raises, so that one bad row does not abort the
  # whole import.
  def self.compound_from(compound_format, identifier)
    case compound_format
    when /SMILES/i
      OpenTox::Compound.from_smiles(identifier)
    when /InChI/i
      OpenTox::Compound.from_inchi(identifier)
    end
  rescue
    nil
  end
  private_class_method :compound_from

  # Append one warning per duplicated compound, listing the 1-based positions
  # in batch.compounds that share its InChI.
  # NOTE(review): Array#duplicates is not core Ruby; presumably a project
  # extension - confirm it is loaded wherever this class is used.
  def self.warn_about_duplicates(batch, name)
    batch.compounds.duplicates.each do |duplicate|
      $logger.debug "Duplicates found in #{name}."
      dup = Compound.find duplicate
      positions = []
      batch.compounds.each_with_index do |compound_id, i|
        c = Compound.find compound_id
        positions << i + 1 if !c.blank? && c.inchi && c.inchi == dup.inchi
      end
      batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}."
    end
  end
  private_class_method :warn_about_duplicates
end
end
|