#!/usr/bin/env ruby
require 'csv'
require_relative "../../../lib/compound.rb"

# Convert EFSA data to mutagenicity classifications.
#
# Usage: pass the path of the tab-separated EFSA table as the first argument.
# Prints a CSV table ("ID,SMILES,Mutagenicity") to stdout with one line per
# distinct SMILES string returned by Compound.
#
# A structure is classified as "mutagenic" if at least one Salmonella
# TA 98 / TA 100 row reports a positive result, and as "non-mutagenic" if only
# negative results were seen. Structures without any positive/negative result
# are not written.

# 0-based column indices of the input table, as used by this script.
ID_COL       = 2
SMILES_COL   = 11
INCHI_COL    = 12
ORGANISM_COL = 24 # matched against /Salmonella/i
STRAIN_COL   = 25 # matched against "TA 98" / "TA 100"
RESULT_COL   = 33 # matched against /Positiv/i and /Negativ/i

# Returns true when the row has a non-empty SMILES cell, a Salmonella
# TA 98 / TA 100 entry and a result cell. Empty (nil) cells never match.
def ames_row?(row)
  smiles = row[SMILES_COL]
  strain = row[STRAIN_COL]
  return false if smiles.nil? || smiles.empty?
  return false unless row[ORGANISM_COL]&.match?(/Salmonella/i)
  return false unless strain && (strain.include?("TA 98") || strain.include?("TA 100"))

  !row[RESULT_COL].nil?
end

# Returns the SMILES string reported by Compound for the row's structure.
# The SMILES column is tried first (with double quotes stripped); if that
# fails the InChI column is used. Returns nil (after a warning on stderr)
# when neither representation can be converted.
def row_smiles(row)
  Compound.from_smiles(row[SMILES_COL].delete('"')).smiles
rescue StandardError
  # some smiles (SMILES_COL) contain non-parseable characters
  begin
    Compound.from_inchi(row[INCHI_COL]).smiles
  rescue StandardError => e
    warn "Skipping row with id #{row[ID_COL].inspect}: cannot parse SMILES or InChI (#{e.message})"
    nil
  end
end

input = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} EFSA_TSV_FILE" if input.nil?

db = {}
rows = CSV.foreach(input, encoding: "UTF-8", col_sep: "\t", liberal_parsing: true)
rows.with_index do |row, index|
  next if index.zero? # first line is the header
  next unless ames_row?(row)

  smiles = row_smiles(row)
  next if smiles.nil?

  entry = (db[smiles] ||= {})
  entry[:id] ||= row[ID_COL] # keep the id of the first matching row
  result = row[RESULT_COL]
  if result.match?(/Positiv/i)
    entry[:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
  elsif result.match?(/Negativ/i)
    entry[:value] ||= "non-mutagenic" # never overrides an earlier positive
  end
end

puts "ID,SMILES,Mutagenicity"
db.each do |smiles, entry|
  next unless entry[:value] # no positive/negative result seen for this structure

  # to_csv quotes fields containing separators or quotes; puts does not add
  # a second newline to the already terminated line.
  puts [entry[:id], smiles, entry[:value]].to_csv
end