From b2f12e257037faa21c14a54eec0205c45c5686c6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 17 Sep 2018 16:53:19 +0200 Subject: efsa parsing fixed, contradictory results --- scripts/efsa2csv.rb | 31 +++++++++++++++---------------- scripts/hansen2csv.rb | 6 ++++-- scripts/kazius2csv.rb | 4 ++-- scripts/merge-mutagenicity.rb | 30 ++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 20 deletions(-) create mode 100755 scripts/merge-mutagenicity.rb (limited to 'scripts') diff --git a/scripts/efsa2csv.rb b/scripts/efsa2csv.rb index 247faac..48d4fb8 100755 --- a/scripts/efsa2csv.rb +++ b/scripts/efsa2csv.rb @@ -4,26 +4,25 @@ require_relative '../../lazar/lib/lazar.rb' i = 0 db = {} -CSV.foreach(ARGV[0], :encoding => 'ISO-8859-1', :col_sep => "\t") do |row| - begin - if i > 0 and row[11] - c = OpenTox::Compound.from_smiles(row[11]).smiles - #c = row[11] - if row[24].match(/Salmonella/i) - if row[25].match("TA 98") or row[25].match("TA 100") - if row[33].match(/Positiv/i) - db[c] = 1 # at least one positive result in TA 98 or TA 100 - elsif row[33].match(/Negativ/i) - db[c] ||= 0 - end - end - end +CSV.foreach(ARGV[0], :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| + #STDERR.puts i if i%100 == 0 + if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] + begin + c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles + rescue + c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters + end + db[c] ||= {} + db[c][:id] ||= "efsa_#{row[2]}" + if row[33].match(/Positiv/i) + db[c][:value] = 1 # at least one positive result in TA 98 or TA 100 + elsif row[33].match(/Negativ/i) + db[c][:value] ||= 0 end - rescue end i += 1 end db.each do |s,v| - puts [s,v].join "," + puts [v[:id],s,v[:value]].join "," end diff --git a/scripts/hansen2csv.rb b/scripts/hansen2csv.rb index 7684afd..3c8c3e1 100755 --- a/scripts/hansen2csv.rb +++ b/scripts/hansen2csv.rb @@ -4,7 +4,9 @@ require_relative '../../lazar/lib/lazar.rb' i = 0 CSV.foreach(ARGV[0]) do |row| - i == 0 ? c = "SMILES" : c = OpenTox::Compound.from_smiles(row[5]).smiles - puts [c, row[2]].join "," + if i > 0 + c = OpenTox::Compound.from_smiles(row[5]).smiles + puts ["hansen_#{row[0]}",c, row[2]].join "," + end i += 1 end diff --git a/scripts/kazius2csv.rb b/scripts/kazius2csv.rb index ed335b2..52b78c1 100755 --- a/scripts/kazius2csv.rb +++ b/scripts/kazius2csv.rb @@ -33,10 +33,10 @@ obconversion = OpenBabel::OBConversion.new obconversion.set_in_and_out_formats "sdf","can" obmol = OpenBabel::OBMol.new -puts "SMILES,Activity" sdfs.each_with_index do |sdf,i| + cas = sdf.split("\n").first.chomp obconversion.read_string obmol,sdf s = obconversion.write_string(obmol).split.first - puts [s,results[i]].join "," + puts ["kazius_#{cas}",s,results[i]].join "," end diff --git a/scripts/merge-mutagenicity.rb b/scripts/merge-mutagenicity.rb new file mode 100755 index 0000000..2de7d1c --- /dev/null +++ b/scripts/merge-mutagenicity.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby +require 'csv' +require 'json' + +acts = {} +ids = {} +ARGV.each do |csv| + CSV.foreach(csv) do |row| + acts[row[1]] ||= [] + acts[row[1]] << row[2] + ids[row[1]] ||= [] + ids[row[1]] << row[0] + end +end + +contradictions = {} +puts ["SMILES","Mutagenicity"].join(",") +acts.each do |s,a| + if a.uniq.size > 1 + contradictions[s] ||= {} + a.each_with_index do |act,i| + contradictions[s][ids[s][i]] = act + puts [s,act].join "," + end + else + puts [s,a.first].join "," + end +end + +File.open(File.join(File.dirname(__FILE__),"..","data","contradictions.json"),"w+") { |cont| cont.puts contradictions.to_json } -- cgit v1.2.3