summary refs log tree commit diff
path: root/scripts/efsa2csv.rb
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/efsa2csv.rb')
-rwxr-xr-x scripts/efsa2csv.rb 31
1 files changed, 15 insertions, 16 deletions
diff --git a/scripts/efsa2csv.rb b/scripts/efsa2csv.rb
index 247faac..48d4fb8 100755
--- a/scripts/efsa2csv.rb
+++ b/scripts/efsa2csv.rb
@@ -4,26 +4,25 @@ require_relative '../../lazar/lib/lazar.rb'
i = 0
db = {}
-CSV.foreach(ARGV[0], :encoding => 'ISO-8859-1', :col_sep => "\t") do |row|
- begin
- if i > 0 and row[11]
- c = OpenTox::Compound.from_smiles(row[11]).smiles
- #c = row[11]
- if row[24].match(/Salmonella/i)
- if row[25].match("TA 98") or row[25].match("TA 100")
- if row[33].match(/Positiv/i)
- db[c] = 1 # at least one positive result in TA 98 or TA 100
- elsif row[33].match(/Negativ/i)
- db[c] ||= 0
- end
- end
- end
+CSV.foreach(ARGV[0], :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
+ #STDERR.puts i if i%100 == 0
+ if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
+ begin
+ c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
+ rescue
+ c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
+ end
+ db[c] ||= {}
+ db[c][:id] ||= "efsa_#{row[2]}"
+ if row[33].match(/Positiv/i)
+ db[c][:value] = 1 # at least one positive result in TA 98 or TA 100
+ elsif row[33].match(/Negativ/i)
+ db[c][:value] ||= 0
end
- rescue
end
i += 1
end
db.each do |s,v|
- puts [s,v].join ","
+ puts [v[:id],s,v[:value]].join ","
end