From b2f12e257037faa21c14a54eec0205c45c5686c6 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 17 Sep 2018 16:53:19 +0200
Subject: efsa parsing fixed, contradictory results written to data/contradictions.json

---
 scripts/efsa2csv.rb           | 31 +++++++++++++++----------------
 scripts/hansen2csv.rb         |  6 ++++--
 scripts/kazius2csv.rb         |  4 ++--
 scripts/merge-mutagenicity.rb | 30 ++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 20 deletions(-)
 create mode 100755 scripts/merge-mutagenicity.rb

(limited to 'scripts')

diff --git a/scripts/efsa2csv.rb b/scripts/efsa2csv.rb
index 247faac..48d4fb8 100755
--- a/scripts/efsa2csv.rb
+++ b/scripts/efsa2csv.rb
@@ -4,26 +4,25 @@ require_relative '../../lazar/lib/lazar.rb'
 
 i = 0
 db = {}
-CSV.foreach(ARGV[0], :encoding => 'ISO-8859-1', :col_sep => "\t") do |row|
-  begin
-    if i > 0 and row[11]
-      c = OpenTox::Compound.from_smiles(row[11]).smiles
-      #c = row[11]
-      if row[24].match(/Salmonella/i)
-        if row[25].match("TA 98") or row[25].match("TA 100")
-          if row[33].match(/Positiv/i)
-            db[c] = 1 # at least one positive result in TA 98 or TA 100
-          elsif row[33].match(/Negativ/i)
-            db[c] ||= 0
-          end
-        end
-      end
+CSV.foreach(ARGV[0], :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
+  #STDERR.puts i if i%100 == 0
+  if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
+    begin
+      c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
+    rescue
+      c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
+    end
+    db[c] ||= {}
+    db[c][:id] ||= "efsa_#{row[2]}"
+    if row[33].match(/Positiv/i)
+      db[c][:value] = 1 # at least one positive result in TA 98 or TA 100
+    elsif row[33].match(/Negativ/i)
+      db[c][:value] ||= 0
     end
-  rescue
   end
   i += 1
 end
 
 db.each do |s,v|
-  puts [s,v].join ","
+  puts [v[:id],s,v[:value]].join ","
 end
diff --git a/scripts/hansen2csv.rb b/scripts/hansen2csv.rb
index 7684afd..3c8c3e1 100755
--- a/scripts/hansen2csv.rb
+++ b/scripts/hansen2csv.rb
@@ -4,7 +4,9 @@ require_relative '../../lazar/lib/lazar.rb'
 
 i = 0
 CSV.foreach(ARGV[0]) do |row|
-  i == 0 ? c = "SMILES" : c = OpenTox::Compound.from_smiles(row[5]).smiles
-  puts [c, row[2]].join ","
+  if i > 0
+    c = OpenTox::Compound.from_smiles(row[5]).smiles
+    puts ["hansen_#{row[0]}",c, row[2]].join ","
+  end
   i += 1
 end
diff --git a/scripts/kazius2csv.rb b/scripts/kazius2csv.rb
index ed335b2..52b78c1 100755
--- a/scripts/kazius2csv.rb
+++ b/scripts/kazius2csv.rb
@@ -33,10 +33,10 @@ obconversion = OpenBabel::OBConversion.new
 obconversion.set_in_and_out_formats "sdf","can"
 obmol = OpenBabel::OBMol.new
 
-puts "SMILES,Activity"
 sdfs.each_with_index do |sdf,i|
+  cas = sdf.split("\n").first.chomp
   obconversion.read_string obmol,sdf
   s = obconversion.write_string(obmol).split.first
-  puts [s,results[i]].join ","
+  puts ["kazius_#{cas}",s,results[i]].join ","
 end
  
diff --git a/scripts/merge-mutagenicity.rb b/scripts/merge-mutagenicity.rb
new file mode 100755
index 0000000..2de7d1c
--- /dev/null
+++ b/scripts/merge-mutagenicity.rb
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+require 'csv'
+require 'json'
+
+acts = {}
+ids = {}
+ARGV.each do |csv|
+  CSV.foreach(csv) do |row|
+    acts[row[1]] ||= []
+    acts[row[1]] << row[2]
+    ids[row[1]] ||= []
+    ids[row[1]] << row[0]
+  end
+end
+
+contradictions = {}
+puts ["SMILES","Mutagenicity"].join(",")
+acts.each do |s,a|
+  if a.uniq.size > 1
+    contradictions[s] ||= {}
+    a.each_with_index do |act,i|
+      contradictions[s][ids[s][i]] = act
+      puts [s,act].join ","
+    end
+  else
+    puts [s,a.first].join ","
+  end
+end
+
+File.open(File.join(File.dirname(__FILE__),"..","data","contradictions.json"),"w+") { |cont| cont.puts contradictions.to_json }
-- 
cgit v1.2.3