summaryrefslogtreecommitdiff
path: root/scripts/carcinogenicity2csv.rb
blob: bed090730d56ae311cfe202247a92996027c6c59 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#!/usr/bin/env ruby
#require_relative '../../lazar/lib/lazar.rb'

# Collect [identifier, activity] pairs from the comma-separated file named by
# the first command-line argument. The first three lines are skipped; the pair
# is taken from columns 1 and 3 (zero-based) of every remaining line.
activities = []
File.readlines(ARGV[0]).drop(3).each do |line|
  tokens = line.split(",")
  # Blank or truncated lines have no column 1 (indexing yields nil), so coerce
  # to a string before testing instead of crashing on nil.empty?.
  if tokens[1].to_s.empty?
    # Report on stderr so the diagnostic never ends up inside the CSV written
    # to stdout, and skip the row: without an identifier there is nothing to
    # look up, and an empty entry would only corrupt the identifier list.
    warn "Skipping line without identifier: #{line.inspect}"
    next
  end
  activities << [tokens[1], tokens[3]]
end

# Emit the CSV header, then resolve the collected identifiers to canonical
# SMILES through the PubChem PUG REST service and print one "SMILES,Activity"
# row per entry of `activities`.
puts "SMILES,Activity"
activities.each_slice(100) do |slice| # query PubChem in chunks of 100 identifiers
  ids = slice.map(&:first)
  # NOTE(review): the identifiers are sent to the compound/cid endpoint while
  # the error message below calls them SIDs -- confirm which identifier
  # column 1 of the input file actually holds.
  url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{ids.join(",")}/property/CanonicalSMILES/TXT"
  # Pass argv as an array so curl is started without a shell: the identifiers
  # come straight from the input file and must never be parsed as shell syntax.
  # --fail turns HTTP errors into a non-zero exit status instead of an error
  # body on stdout; --silent/--show-error keep stderr limited to real errors.
  response = IO.popen(["curl", "--silent", "--show-error", "--fail", url], &:read)
  abort("PubChem request failed (curl exit status #{$?.exitstatus})") unless $?.success?
  smiles = response.split("\n")
  abort("Could not get SMILES for all SIDs from PubChem") unless ids.size == smiles.size
  # Responses are matched to the requested identifiers by position.
  smiles.zip(slice) do |smi, (_id, activity)|
    puts [smi.chomp, activity].join(",")
  end
end