summary | refs | log | tree | commit | diff
path: root/scripts/carcinogenicity2csv.rb
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/carcinogenicity2csv.rb')
-rwxr-xr-x scripts/carcinogenicity2csv.rb 25
1 files changed, 25 insertions, 0 deletions
diff --git a/scripts/carcinogenicity2csv.rb b/scripts/carcinogenicity2csv.rb
new file mode 100755
index 0000000..9501bdc
--- /dev/null
+++ b/scripts/carcinogenicity2csv.rb
@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar.rb'
+
+i = 0
+activities = []
+File.readlines(ARGV[0]).each do |line|
+ if i > 2
+ tokens = line.split ","
+ p line if tokens[1].empty?
+ activities << [tokens[1],tokens[3]]
+ end
+ i += 1
+end
+
+puts "SMILES,Activity"
+activities.each_slice(100) do |slice| # get SMILES in chunks
+ sids = slice.collect{|e| e[0]}
+ smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n")
+ abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size
+ smiles.each_with_index do |smi,i|
+ act = slice[i]
+ puts [smi.chomp,act[1]].join(",")
+ end
+end
+