diff options
Diffstat (limited to 'scripts/efsa2csv.rb')
-rwxr-xr-x | scripts/efsa2csv.rb | 31 |
1 files changed, 15 insertions, 16 deletions
diff --git a/scripts/efsa2csv.rb b/scripts/efsa2csv.rb index 247faac..48d4fb8 100755 --- a/scripts/efsa2csv.rb +++ b/scripts/efsa2csv.rb @@ -4,26 +4,25 @@ require_relative '../../lazar/lib/lazar.rb' i = 0 db = {} -CSV.foreach(ARGV[0], :encoding => 'ISO-8859-1', :col_sep => "\t") do |row| - begin - if i > 0 and row[11] - c = OpenTox::Compound.from_smiles(row[11]).smiles - #c = row[11] - if row[24].match(/Salmonella/i) - if row[25].match("TA 98") or row[25].match("TA 100") - if row[33].match(/Positiv/i) - db[c] = 1 # at least one positive result in TA 98 or TA 100 - elsif row[33].match(/Negativ/i) - db[c] ||= 0 - end - end - end +CSV.foreach(ARGV[0], :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| + #STDERR.puts i if i%100 == 0 + if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] + begin + c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles + rescue + c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters + end + db[c] ||= {} + db[c][:id] ||= "efsa_#{row[2]}" + if row[33].match(/Positiv/i) + db[c][:value] = 1 # at least one positive result in TA 98 or TA 100 + elsif row[33].match(/Negativ/i) + db[c][:value] ||= 0 end - rescue end i += 1 end db.each do |s,v| - puts [s,v].join "," + puts [v[:id],s,v[:value]].join "," end |