summary refs log tree commit diff
path: root/scripts/padel-descriptors.rb
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/padel-descriptors.rb')
-rwxr-xr-x scripts/padel-descriptors.rb 41
1 files changed, 41 insertions, 0 deletions
diff --git a/scripts/padel-descriptors.rb b/scripts/padel-descriptors.rb
new file mode 100755
index 0000000..691137f
--- /dev/null
+++ b/scripts/padel-descriptors.rb
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+train = File.readlines(ARGV[0]) # first argument: training file, fields separated by ","
+pa = File.readlines(ARGV[1]) # second argument: PA file, fields separated by ";"
+train_header = train.shift.chomp.split(",").collect{|i| i.gsub('"','')} # first line is the header; double quotes are stripped from the names
+pa_header = pa.shift.chomp.split(";") # first line is the header; names are used as-is
+train_header.shift # drop the first column name: data rows use that column as the id
+train_header.pop # drop the last column name: data rows use that column as the activity
+pa_header.shift # drop the first column name: data rows use that column as the id
+# Merges both inputs into one CSV on stdout, keeping only descriptor columns present in both headers. The commented-out lines below are a diagnostic listing the descriptors unique to each input.
+#train_only = train_header - pa_header
+#pa_only = pa_header - train_header
+#puts train_only.size.to_s+ " training set descriptors missing from PAs:"
+#puts train_only.join(",")
+#puts
+#puts pa_only.size.to_s+ " PA descriptors not in training set:"
+#puts pa_only.join(",")
+#exit
+
+common = train_header & pa_header # descriptor names found in both headers, in training-header order
+# NOTE(review): the header line below has one field fewer than the data rows (no name for the id column) - presumably so R-style readers take the first column as row names; confirm.
+puts (["Mutagenicity"]+common).join(",")
+train.each do |line|
+ items = line.chomp.split ","
+ id = items.shift # first field is the row id
+ id = "TRAIN"+id.gsub('"','') # prefix marks the row as training data; quotes removed
+ act = items.pop # last field is the activity
+ act == '"1"' ? act = "mutagen" : act = "non-mutagen" # only the quoted literal "1" maps to mutagen; every other value becomes non-mutagen
+ descriptors = {} # descriptor name => numeric value for this row
+ items.each_with_index {|item,i| descriptors[train_header[i]] = item.sub(',','.').to_f } # sub(',','.') cannot match here (line already split on ","); to_f gives 0.0 for non-numeric text
+ puts ([id,act]+common.collect{|h| descriptors[h]}).join(",") # values follow the order of common; a missing key is nil and prints as an empty field
+end
+
+# PA rows: same output layout as the training rows, with the fixed activity label "PA".
+pa.each do |line|
+ items = line.chomp.split ";"
+ id = "PA"+items.shift # first field is the row id; prefix marks the row as PA data
+ act = "PA"
+ descriptors = {} # descriptor name => numeric value for this row
+ items.each_with_index {|item,i| descriptors[pa_header[i]] = item.sub(',','.').to_f } # first "," in a field becomes "." before to_f (presumably decimal commas - confirm)
+ puts ([id,act]+common.collect{|h| descriptors[h]}).join(",") # values follow the order of common; a missing key is nil and prints as an empty field
+end