diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/lazar-cv-predictions.rb | 31 | ||||
-rwxr-xr-x | scripts/lazar-pa-predictions.rb | 10 | ||||
-rwxr-xr-x | scripts/mp2d-fingerprints.rb | 11 | ||||
-rwxr-xr-x | scripts/sanitize-pa-data.rb | 53 | ||||
-rwxr-xr-x | scripts/tensorflow-cv-predictions.rb | 25 | ||||
-rwxr-xr-x | scripts/tensorflow-pa-predictions.rb | 26 | ||||
-rw-r--r-- | scripts/tsne-cdk-coordinates.R | 11 | ||||
-rw-r--r-- | scripts/tsne-mp2d-coordinates.R | 12 | ||||
-rwxr-xr-x | scripts/tsne-mutagenicity.rb | 21 |
9 files changed, 200 insertions, 0 deletions
diff --git a/scripts/lazar-cv-predictions.rb b/scripts/lazar-cv-predictions.rb
new file mode 100755
index 0000000..9236bec
--- /dev/null
+++ b/scripts/lazar-cv-predictions.rb
@@ -0,0 +1,31 @@
+#!/usr/bin/env ruby
+
+predictions = {}
+minsim = ARGV[1].to_f
+minsim ||= 0
+Dir[File.join(ARGV[0],"crossvalidation","*","test","predictions")].each do |pred|
+  File.readlines(pred).each_with_index do |l,i|
+    smi,m,pred,pa,pi,maxsim,n = l.split(",")
+    predictions[smi] = [pred,maxsim.to_f]
+  end
+end
+lines = File.readlines(File.join("mutagenicity","mutagenicity.csv"))
+lines.shift
+lines.each do |line|
+  smi,exp = line.chomp.split(",")
+  if predictions[smi]
+    if predictions[smi].first == "1" and exp == "1" and predictions[smi].last >= minsim
+      puts [smi,"TP"].join(",")
+    elsif predictions[smi].first == "0" and exp == "0" and predictions[smi].last >= minsim
+      puts [smi,"TN"].join(",")
+    elsif predictions[smi].first == "1" and exp == "0" and predictions[smi].last >= minsim
+      puts [smi,"FP"].join(",")
+    elsif predictions[smi].first == "0" and exp == "1" and predictions[smi].last >= minsim
+      puts [smi,"FN"].join(",")
+    else
+      puts [smi,"NA"].join(",")
+    end
+  else
+    puts [smi,"NA"].join(",")
+  end
+end
diff --git a/scripts/lazar-pa-predictions.rb b/scripts/lazar-pa-predictions.rb
new file mode 100755
index 0000000..3fb61da
--- /dev/null
+++ b/scripts/lazar-pa-predictions.rb
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+
+predictions = {}
+minsim = ARGV[1].to_f
+minsim ||= 0
+File.readlines(ARGV[0]).each do |l|
+  smi,m,pred,pa,pi,maxsim,n = l.split(",")
+  pred = "NA" if maxsim.to_f < minsim
+  puts [smi,pred].join(",")
+end
diff --git a/scripts/mp2d-fingerprints.rb b/scripts/mp2d-fingerprints.rb
new file mode 100755
index 0000000..d016594
--- /dev/null
+++ b/scripts/mp2d-fingerprints.rb
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+lines = File.readlines(ARGV[0])
+header = lines.shift.chomp.split(",")
+i = header.index("Canonical SMILES")
+smi = lines.collect{|l| l.chomp.split(",")[i]}
+File.open("/tmp/smi","w+"){|f| f.puts smi.join("\n")} # avoid "Argument list too long (Errno::E2BIG)"
+mp2d = `cat /tmp/smi | obabel -ismi - -ompd`.split("\n").collect{|l| l.split("\t")[1..-1]}
+smi.each_with_index do |s,i|
+  puts ([s]+mp2d[i]).join(",")
+end
diff --git a/scripts/sanitize-pa-data.rb b/scripts/sanitize-pa-data.rb
new file mode 100755
index 0000000..83f85e2
--- /dev/null
+++ b/scripts/sanitize-pa-data.rb
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+require 'csv'
+src = File.join "pyrrolizidine-alkaloids","src"
+dest = "pyrrolizidine-alkaloids"
+
+ids = CSV.read(File.join(src,"180920_PA_complete_SMILES.csv"), headers: true, col_sep: ";")
+cansmi = `echo "#{ids.collect{|r| r[3].gsub(';','')}.join("\n")}" | obabel -ismi - -ocan`.gsub("\t","").split("\n")
+raise "Could not convert all smiles" unless ids.size == cansmi.size
+
+smi_by_cid = {}
+File.open(File.join(dest,"pa-cids.csv"),"w+") do |f|
+  f.puts ["CID","Canonical SMILES"].join(",")
+  ids.by_col["CID"].each_with_index do |cid,i|
+    f.puts [cid,cansmi[i]].join(",")
+    smi_by_cid[cid] = cansmi[i]
+  end
+end
+
+File.open(File.join(dest,"pa-ids.csv"),"w+") do |f|
+  f.puts ["ID","Canonical SMILES"].join(",")
+  ids.by_col["ID"].each_with_index do |id,i|
+    f.puts [id,cansmi[i]].join(",")
+  end
+end
+File.open(File.join(dest,"pa-names.tsv"),"w+") do |f|
+  f.puts ["Canonical SMILES","Name"].join("\t")
+  ids.by_col["Name"].each_with_index do |name,i|
+    name.sub!("1: ","") if name
+    f.puts [cansmi[i],name].join("\t")
+  end
+end
+
+groups = CSV.read(File.join(src,"pa-groups.original.csv"), headers: true, col_sep: ";")
+raise "Unequal IDs in 180920_PA_complete_SMILES.csv and pa-groups.original.csv" unless ids["ID"] == groups[nil]
+
+File.open(File.join(dest,"pa-groups.csv"),"w+") do |f|
+  f.puts (["Canonical SMILES"] + groups.headers[1..groups.headers.size-1]).join ","
+  groups.each_with_index do |row,i|
+    f.puts ([cansmi[i]]+ row[1..9].collect{|g| g == "NA" ? 0 : 1}).join(",")
+  end
+end
+
+CSV::Converters[:comma_numbers] =
+  cdk = CSV.read(File.join(src,"PA-Padel-2D_m2.csv"), headers: true, col_sep: ";", converters: ->(s) {(s =~ /^-*\d+,/) ? (s.sub(',','.').to_f) : s})
+headers = cdk.headers
+headers[0] = "Canonical SMILES"
+File.open(File.join(dest,"pa-cdk.csv"),"w+") do |f|
+  f.puts headers.join(",")
+  cdk.each do |row|
+    row[0] = smi_by_cid[row[0]]
+    f.puts row.to_s
+  end
+end
diff --git a/scripts/tensorflow-cv-predictions.rb b/scripts/tensorflow-cv-predictions.rb
new file mode 100755
index 0000000..370a299
--- /dev/null
+++ b/scripts/tensorflow-cv-predictions.rb
@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+
+predictions = {}
+lines = File.readlines(ARGV[0])
+lines.shift
+lines.each do |line|
+  smi,prob = line.chomp.split(",")
+  prob.to_f < 0.5 ? predictions[smi] = 0 : predictions[smi] = 1
+end
+lines = File.readlines(File.join("mutagenicity","mutagenicity.csv"))
+lines.shift
+lines.each do |line|
+  smi,exp = line.chomp.split(",")
+  if predictions[smi] == 1 and exp == "1"
+    puts [smi,"TP"].join(",")
+  elsif predictions[smi] == 0 and exp == "0"
+    puts [smi,"TN"].join(",")
+  elsif predictions[smi] == 1 and exp == "0"
+    puts [smi,"FP"].join(",")
+  elsif predictions[smi] == 0 and exp == "1"
+    puts [smi,"FN"].join(",")
+  else
+    puts [smi,"NA"].join(",")
+  end
+end
diff --git a/scripts/tensorflow-pa-predictions.rb b/scripts/tensorflow-pa-predictions.rb
new file mode 100755
index 0000000..0eeb155
--- /dev/null
+++ b/scripts/tensorflow-pa-predictions.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+
+smiles = {}
+lines = File.readlines(ARGV[1])
+lines.shift
+lines.each do |line|
+  cid,smi = line.chomp.split(",")
+  smiles[cid] = smi
+end
+
+lines = File.readlines(ARGV[0])
+lines.shift
+lines.each do |line|
+  id,prob = line.chomp.split(",")
+  prob.to_f < 0.5 ? pred = 0 : pred = 1
+  if id.match(/^\d+$/)
+    puts [smiles[id],pred].join(",")
+  else
+    if smiles.values.include? id
+      puts [id,pred].join(",")
+    else
+      cansmi = `echo "#{id}" | obabel -ismi - -ocan | tr -d "\t"`.chomp
+      puts [cansmi,pred].join(",")
+    end
+  end
+end
diff --git a/scripts/tsne-cdk-coordinates.R b/scripts/tsne-cdk-coordinates.R
new file mode 100644
index 0000000..f9baff9
--- /dev/null
+++ b/scripts/tsne-cdk-coordinates.R
@@ -0,0 +1,11 @@
+#!/usr/bin/env Rscript
+library(Rtsne)
+library(ggplot2)
+args = commandArgs(trailingOnly=TRUE)
+data = read.csv(args[1],header=F)
+smi = data[,1]
+data[,1] = NULL
+m = as.matrix(data)
+set.seed(66)
+tsne = Rtsne(m,verbose=T,check_duplicates=F)
+write.csv(data.frame(smiles= smi, x = tsne$Y[,1], y = tsne$Y[,2]),args[2],row.names=F)
diff --git a/scripts/tsne-mp2d-coordinates.R b/scripts/tsne-mp2d-coordinates.R
new file mode 100644
index 0000000..ef97595
--- /dev/null
+++ b/scripts/tsne-mp2d-coordinates.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+library(Rtsne)
+library(ggplot2)
+args = commandArgs(trailingOnly=TRUE)
+data = read.csv(args[1],header=F)
+smi = data[,1]
+data[,1] = NULL
+m = as.matrix(data)
+dist = as.dist(m)
+set.seed(66)
+tsne = Rtsne(dist,verbose=T,is_distance=T)
+write.csv(data.frame(smiles= smi, x = tsne$Y[,1], y = tsne$Y[,2]),args[2],row.names=F)
diff --git a/scripts/tsne-mutagenicity.rb b/scripts/tsne-mutagenicity.rb
new file mode 100755
index 0000000..45b4cc1
--- /dev/null
+++ b/scripts/tsne-mutagenicity.rb
@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+
+lines = File.readlines(ARGV[0])
+lines.shift
+mut = File.readlines(ARGV[1])
+mut.shift
+mutagenicity = {}
+mut.each do |l|
+  smi,m = l.chomp.split(",")
+  if m == "1"
+    mutagenicity[smi] = "mutagen"
+  elsif m == "0"
+    mutagenicity[smi] = "non-mutagen"
+  else
+    mutagenicity[smi] = "NA"
+  end
+end
+lines.collect{|l| l.chomp.split(",").first}.each do |smi|
+  smi.gsub!('"','')
+  mutagenicity[smi] ? puts(mutagenicity[smi]) : puts("PA")
+end