diff options
Diffstat (limited to 'scripts/padel-descriptors.rb')
-rwxr-xr-x | scripts/padel-descriptors.rb | 41 |
1 files changed, 41 insertions, 0 deletions
#!/usr/bin/env ruby
#
# Merges a training-set descriptor table and a "PA" descriptor table into a
# single CSV written to stdout, keeping only the descriptor columns whose
# names occur in both inputs.
#
# Usage: padel-descriptors.rb TRAIN_CSV PA_CSV
#
# TRAIN_CSV  comma separated; header names and IDs may be double-quoted.
#            First column: ID. Last column: activity ("1" means mutagen,
#            anything else non-mutagen). Columns in between: descriptors.
# PA_CSV     semicolon separated. First column: ID. Remaining columns:
#            descriptors; a decimal comma is accepted in values.
#
# Output     header "Mutagenicity,<common descriptors>", then one row per
#            input row: "<ID>,<label>,<values...>". IDs are prefixed with
#            TRAIN or PA; PA rows get the label "PA".
#            NOTE(review): the header intentionally has one field fewer than
#            the data rows (the ID column is unnamed) - presumably so that R's
#            read.csv uses the first column as row names; confirm with the
#            consumer before "fixing" it.
#
# Known limitations (unchanged from the original script):
#   * values are converted with String#to_f, so non-numeric cells become 0.0
#   * String#split drops trailing empty fields; such cells are printed empty

USAGE = "usage: padel-descriptors.rb TRAIN_CSV PA_CSV"

# Returns +text+ with every double quote removed.
def unquote(text)
  text.delete('"')
end

# Translates the raw activity cell of a training row into its class label.
# Quotes are stripped first, so both "1" (quoted) and 1 (bare) mean mutagen.
def activity_label(raw)
  unquote(raw.to_s).strip == "1" ? "mutagen" : "non-mutagen"
end

# Returns the data lines of a table: everything after the header line,
# without blank lines (e.g. a trailing empty line at the end of the file).
def data_rows(lines)
  lines.drop(1).reject { |line| line.strip.empty? }
end

# Builds a name => Float hash for one row. +items+ are the raw descriptor
# cells, positionally aligned with +header+. With duplicate names the last
# column wins.
def descriptor_values(header, items)
  items.each_with_index.each_with_object({}) do |(item, index), values|
    # Only the first comma is replaced: it is treated as a decimal separator.
    values[header[index]] = item.sub(",", ".").to_f
  end
end

# Formats one output line: ID, label, then the values of the +common+
# descriptors in that order. Descriptors missing from the row print empty.
def output_row(id, label, header, items, common)
  values = descriptor_values(header, items)
  ([id, label] + common.map { |name| values[name] }).join(",")
end

# Converts the two tables (arrays of lines, header first) into the array of
# output lines. The input arrays are not modified.
#
# Raises ArgumentError when either table has no header line.
def merge_descriptor_tables(train_lines, pa_lines)
  raise ArgumentError, "training table is empty" if train_lines.empty?
  raise ArgumentError, "PA table is empty" if pa_lines.empty?

  train_header = train_lines.first.chomp.split(",").map { |name| unquote(name) }
  train_header.shift # name of the ID column
  train_header.pop   # name of the activity column
  pa_header = pa_lines.first.chomp.split(";")
  pa_header.shift    # name of the ID column

  # Disabled diagnostic: list descriptors that exist in only one input.
  # train_only = train_header - pa_header
  # pa_only = pa_header - train_header
  # puts train_only.size.to_s+ " training set descriptors missing from PAs:"
  # puts train_only.join(",")
  # puts
  # puts pa_only.size.to_s+ " PA descriptors not in training set:"
  # puts pa_only.join(",")
  # exit

  # Array#& keeps the order of the training header and removes duplicates.
  common = train_header & pa_header

  output = [(["Mutagenicity"] + common).join(",")]

  data_rows(train_lines).each do |line|
    items = line.chomp.split(",")
    id = "TRAIN#{unquote(items.shift)}"
    label = activity_label(items.pop)
    output << output_row(id, label, train_header, items, common)
  end

  data_rows(pa_lines).each do |line|
    items = line.chomp.split(";")
    id = "PA#{items.shift}"
    output << output_row(id, "PA", pa_header, items, common)
  end

  output
end

case ARGV.size
when 2
  puts merge_descriptor_tables(File.readlines(ARGV[0]), File.readlines(ARGV[1]))
when 0
  # No arguments: only show the usage, so the helpers above stay loadable.
  warn USAGE
else
  abort USAGE
end