summaryrefslogtreecommitdiff
path: root/scripts/sanitize-pa-data.rb
blob: 83f85e22f825309cac6e80f368dd047afb9c21cc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env ruby
require 'csv'
# Needed only for the obabel call below; required here so this step is self-contained.
require 'open3'

# Directory holding the original input files, and directory receiving the sanitized output.
src = File.join "pyrrolizidine-alkaloids","src"
dest = "pyrrolizidine-alkaloids"

# Master table (semicolon-separated, with a header row); the code below reads
# its "CID", "ID" and "Name" columns by header and the fourth column by index.
ids = CSV.read(File.join(src,"180920_PA_complete_SMILES.csv"), headers: true, col_sep: ";")

# Fourth column of every row with any ';' removed.
# NOTE(review): presumably this column holds the input SMILES -- confirm against the file.
smiles = ids.collect { |r| r[3].gsub(';', '') }

# Pipe the SMILES to obabel on stdin instead of interpolating them into an
# `echo "..."` shell command: SMILES may contain characters such as '\', '$'
# or '"' that a shell (or its echo builtin) would interpret and corrupt.
# With an argv list no shell is involved. The trailing newline mirrors what
# `echo` used to append.
obabel_out, _status = Open3.capture2("obabel", "-ismi", "-", "-ocan", stdin_data: smiles.join("\n") + "\n")

# Remove tabs from the obabel output and keep one converted entry per line.
cansmi = obabel_out.gsub("\t","").split("\n")

# Every input row must have produced exactly one output line; later steps
# pair rows and canonical SMILES purely by position.
raise "Could not convert all smiles" unless ids.size == cansmi.size

# Write pa-cids.csv (CID, canonical SMILES) and remember the same pairing in
# smi_by_cid so later steps can look a structure up by its CID.
smi_by_cid = {}
cid_header = ["CID","Canonical SMILES"]
File.open(File.join(dest,"pa-cids.csv"),"w+") do |out|
  out.puts cid_header.join(",")
  # Rows and canonical SMILES correspond by position.
  ids.by_col["CID"].zip(cansmi).each do |cid, smiles|
    smi_by_cid[cid] = smiles
    out.puts [cid, smiles].join(",")
  end
end

# Write pa-ids.csv: one line per row of the ID table, pairing the "ID" column
# with the canonical SMILES at the same position.
File.open(File.join(dest,"pa-ids.csv"),"w+") do |out|
  id_header = ["ID","Canonical SMILES"]
  out.puts id_header.join(",")
  ids.by_col["ID"].zip(cansmi).each do |id, smiles|
    out.puts [id, smiles].join(",")
  end
end
# Write pa-names.tsv: tab-separated canonical SMILES and compound name.
File.open(File.join(dest,"pa-names.tsv"),"w+") do |out|
  names_header = ["Canonical SMILES","Name"]
  out.puts names_header.join("\t")
  ids.by_col["Name"].zip(cansmi).each do |name, smiles|
    # Remove the first "1: " occurrence in place; rows without a name are left as nil.
    name.sub!("1: ","") unless name.nil?
    out.puts [smiles, name].join("\t")
  end
end

# Group annotations (semicolon-separated, with a header row).
groups = CSV.read(File.join(src,"pa-groups.original.csv"), headers: true, col_sep: ";")

# The column looked up with a nil header must list the same IDs, in the same
# order, as the ID table, because rows are matched to canonical SMILES by position.
# NOTE(review): presumably the first column of pa-groups.original.csv has an empty header -- confirm.
raise "Unequal IDs in 180920_PA_complete_SMILES.csv and pa-groups.original.csv" unless ids["ID"] == groups[nil]

# Write pa-groups.csv: canonical SMILES followed by one 0/1 flag per group column.
File.open(File.join(dest,"pa-groups.csv"),"w+") do |f|
  # Every column after the first is treated as a group column.
  group_names = groups.headers.drop(1)
  f.puts((["Canonical SMILES"] + group_names).join(","))
  groups.each_with_index do |row,i|
    # Take the same columns as the header line (previously hard-coded to
    # columns 1..9, which could disagree with the header width).
    # "NA" becomes 0, any other value becomes 1.
    flags = row.fields.drop(1).collect { |g| g == "NA" ? 0 : 1 }
    f.puts(([cansmi[i]] + flags).join(","))
  end
end

# Field converter for decimal-comma numbers: a field that starts with optional
# '-' signs, digits and a comma has its first comma replaced by a dot and is
# parsed as a Float; every other field is passed through unchanged.
# (Previously this assignment was left dangling and, via line continuation,
# stored the parsed table itself under :comma_numbers.)
CSV::Converters[:comma_numbers] = ->(s) { (s =~ /^-*\d+,/) ? (s.sub(',','.').to_f) : s }

# Descriptor table (semicolon-separated, with a header row), read with the
# converter registered above.
cdk = CSV.read(File.join(src,"PA-Padel-2D_m2.csv"), headers: true, col_sep: ";", converters: :comma_numbers)

# The first column is rewritten to hold canonical SMILES, so rename its header.
headers = cdk.headers
headers[0] = "Canonical SMILES"

# Write pa-cdk.csv: header line, then each row with its first field replaced
# by the canonical SMILES looked up in smi_by_cid.
File.open(File.join(dest,"pa-cdk.csv"),"w+") do |f|
  f.puts headers.join(",")
  cdk.each do |row|
    # NOTE(review): a first-column value missing from smi_by_cid yields nil
    # here, i.e. an empty SMILES field in the output -- confirm that is acceptable.
    row[0] = smi_by_cid[row[0]]
    f.puts row.to_s
  end
end