1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
require 'spreadsheet'
require 'roo'
# Reads an uploaded CSV or Excel file whose first two columns hold a SMILES
# string and an activity value, and collects the valid rows in an
# OpenTox::Dataset under a single endpoint feature.
#
# Problems found while parsing are exposed through the accessors:
#   format_errors   - String, empty when the file could be read
#   smiles_errors   - Array of "Row n: smiles, activity" for rows without an InChI
#   activity_errors - Array of messages for rows with an unusable activity
#   duplicates      - Hash of InChI => Array of "Row n: ..." entries (every
#                     accepted row is listed, so unique compounds have one entry)
#   nr_compounds    - number of rows that passed validation
#   dataset_uri     - result of Dataset#save; stays nil when format_errors is set
#
# NOTE(review): OpenTox, LOGGER, TRUE_REGEXP, FALSE_REGEXP, Excel and Excelx are
# not defined in this file - presumably provided by the application and roo.
class Parser

  attr_accessor :dataset, :format_errors, :smiles_errors, :activity_errors, :duplicates, :nr_compounds, :dataset_uri

  # Parses the upload and, unless the file format is unusable, saves the dataset.
  #
  # file         - Hash; file[:filename] (String) and file[:tempfile] (IO-like
  #                object; the Excel path additionally needs #path) are read
  # endpoint_uri - String; used as the feature URI, and the part after the last
  #                '#' (URI-decoded) becomes the dataset title
  def initialize(file, endpoint_uri)
    @file = file
    @dataset = OpenTox::Dataset.new
    @feature_uri = endpoint_uri
    @dataset.features << endpoint_uri
    # NOTE(review): URI.decode no longer exists in Ruby 3.0+ - confirm target runtime.
    @dataset.title = URI.decode(endpoint_uri.split(/#/).last)
    @format_errors = ""
    @smiles_errors = []
    @activity_errors = []
    @duplicates = {}
    @nr_compounds = 0
    @data = []
    @activities = []
    @type = "classification"

    # check format by extension - not all browsers provide correct content-type
    case File.extname(@file[:filename])
    when ".csv"
      csv
    when ".xls", ".xlsx"
      excel
    else
      @format_errors = "#{@file[:filename]} is a unsupported file type."
    end

    # An unreadable upload must not leave a partially filled dataset behind,
    # whichever reader detected the problem.
    return unless @format_errors.empty?

    # create dataset
    @data.each do |compound_uri, activity, row|
      add_activity(compound_uri, activity, row)
    end
    @dataset_uri = @dataset.save
  end

  # Reads the upload line by line as "smiles<sep>activity" with ',' or ';' as
  # separator. Quotes are removed before splitting. Stops at the first line
  # that does not look like CSV, sets format_errors and returns false.
  def csv
    row = 0
    @file[:tempfile].each_line do |line|
      row += 1
      record = line.chomp
      unless record.match(/^.+[,;].*$/) # check CSV format
        @format_errors = "#{@file[:filename]} is not a valid CSV file."
        return false
      end
      items = record.gsub(/["']/, '').split(/\s*[,;]\s*/) # remove quotes
      LOGGER.debug items.join(",")
      input = validate(items[0], items[1], row) # smiles, activity
      @data << input if input
    end
  end

  # Moves the upload into tmp/, reads column 1 (smiles) and column 2 (activity)
  # of every row of the first sheet, and removes the moved copy afterwards.
  # Any error while reading sets format_errors and returns false.
  def excel
    # The filename is supplied by the client: keep only its last component so
    # the copy cannot be written outside tmp/.
    path = File.join('tmp', File.basename(@file[:filename]))
    # NOTE(review): File.mv is not a core File method in current Ruby -
    # presumably provided by 1.8's ftools loaded elsewhere; confirm.
    File.mv(@file[:tempfile].path, path)
    begin
      if File.extname(@file[:filename]) == ".xlsx"
        book = Excelx.new(path)
      else
        book = Excel.new(path)
      end
      book.default_sheet = 0
      1.upto(book.last_row) do |row|
        input = validate(book.cell(row, 1), book.cell(row, 2), row) # smiles, activity
        @data << input if input
      end
    rescue
      @format_errors = "#{@file[:filename]} is not a valid Excel input file."
      return false
    ensure
      # The tempfile was moved above, so the copy in tmp/ is what has to go -
      # on success and on failure alike.
      remove_file(path)
    end
  end

  # Checks one input row.
  #
  # Returns [compound_uri, act, row] for a usable row, otherwise false after
  # recording the row in smiles_errors (no InChI could be obtained) or
  # activity_errors (activity is neither numeric nor a classification value).
  # The first non-classification activity switches the dataset to "regression".
  def validate(smiles, act, row)
    compound = OpenTox::Compound.new(:smiles => smiles)
    # Ask for the InChI once; it is needed for the check and as duplicates key.
    inchi = compound.nil? ? nil : compound.inchi
    entry = "Row #{row}: " + [smiles, act].join(", ")

    if inchi.nil? || inchi == ""
      @smiles_errors << entry
      return false
    end

    class_value = classification?(act)
    unless numeric?(act) || class_value
      @activity_errors << entry
      return false
    end

    (@duplicates[inchi] ||= []) << entry
    @type = "regression" unless class_value
    @nr_compounds += 1
    [compound.uri, act, row]
  end

  # True when Kernel#Float accepts the object.
  def numeric?(object)
    Float(object)
    true
  rescue StandardError
    false
  end

  # True when the stripped string form matches TRUE_REGEXP or FALSE_REGEXP.
  def classification?(object)
    value = object.to_s.strip
    !value.match(TRUE_REGEXP).nil? || !value.match(FALSE_REGEXP).nil?
  end

  private

  # Adds one validated row to the dataset according to the detected type.
  def add_activity(compound_uri, activity, row)
    @dataset.compounds << compound_uri
    @dataset.data[compound_uri] ||= []
    case @type
    when "classification"
      # Strip exactly as classification? does; otherwise a value such as
      # "true " passes validation but is silently dropped here.
      case activity.to_s.strip
      when TRUE_REGEXP
        @dataset.data[compound_uri] << { @feature_uri => true }
      when FALSE_REGEXP
        @dataset.data[compound_uri] << { @feature_uri => false }
      end
    when "regression"
      value = activity.to_f
      if value == 0
        @activity_errors << "Row #{row}: Zero values not allowed for regression datasets - entry ignored."
      else
        @dataset.data[compound_uri] << { @feature_uri => value }
      end
    end
  end

  # Best-effort removal of a temporary file; a failed cleanup must not mask
  # the parsing result.
  def remove_file(path)
    File.delete(path) if File.exist?(path)
  rescue SystemCallError
    nil
  end
end
|