summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2021-02-06 20:21:58 +0100
committer Christoph Helma <helma@in-silico.ch> 2021-02-06 20:21:58 +0100
commit 771a2381ae0fd5e352f23d7223baeb26e8bb4e02 (patch)
tree 1a7182a83ec15b9d7a69e5508c8d3b2bfa5983df /scripts
parent 241f997c3a1a6a38fa47070f5efbd23852cc432b (diff)
svm validation, tensorflow mp2d pa predictions, r results removed, cleanup
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/cv-tensorflow-confusion-matrix.rb 2
-rwxr-xr-x scripts/mp2d-distances.rb 2
-rwxr-xr-x scripts/pa-fingerprints.rb 20
-rwxr-xr-x scripts/pa-groups.R 23
-rwxr-xr-x scripts/pa-table.rb 28
-rwxr-xr-x scripts/pa-tex-table.rb 10
-rwxr-xr-x scripts/summary2table.rb 20
7 files changed, 83 insertions, 22 deletions
diff --git a/scripts/cv-tensorflow-confusion-matrix.rb b/scripts/cv-tensorflow-confusion-matrix.rb
index fba5c49..087d905 100755
--- a/scripts/cv-tensorflow-confusion-matrix.rb
+++ b/scripts/cv-tensorflow-confusion-matrix.rb
@@ -7,7 +7,7 @@ tn = 0
fn = 0
pred = CSV.read(ARGV[0],headers: true,:col_sep => ",")
-act = CSV.read(File.join("data","mutagenicity.csv"),headers: true,:col_sep => ",")
+act = CSV.read(File.join("data","training","mutagenicity.csv"),headers: true,:col_sep => ",")
data = {}
diff --git a/scripts/mp2d-distances.rb b/scripts/mp2d-distances.rb
index 398504d..05313c2 100755
--- a/scripts/mp2d-distances.rb
+++ b/scripts/mp2d-distances.rb
@@ -17,7 +17,7 @@ File.readlines(File.join("..","lazar","models","mutagenicity","independent_varia
end
end
-File.readlines(File.join("pyrrolizidine-alkaloids","PA-smiles.csv")).each_with_index do |line,i|
+File.readlines(File.join("pyrrolizidine-alkaloids","lazar","pa-smiles.csv")).each_with_index do |line,i|
if i > 0
(id,smiles) = line.chomp.split(",")
independent_variables << Compound.new(smiles).fingerprint
diff --git a/scripts/pa-fingerprints.rb b/scripts/pa-fingerprints.rb
new file mode 100755
index 0000000..344ba86
--- /dev/null
+++ b/scripts/pa-fingerprints.rb
@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+require_relative "../../lazar/lib/lazar.rb"
+training_fingerprints = `sed -n '1p' data/mutagenicity-fingerprints.csv`.chomp.split(",")
+training_fingerprints.pop
+puts training_fingerprints.join(",")
+training_fingerprints.shift
+
+File.readlines(File.join("pyrrolizidine-alkaloids","lazar","pa-smiles.csv")).each_with_index do |line,i|
+ if i > 0
+ (id,smiles) = line.chomp.split(",")
+ c = Compound.new(smiles)
+ out = [c.smiles]
+ fp = c.fingerprint
+ training_fingerprints.each do |frag|
+ fp.include?(frag) ? out << 1 : out << 0
+ end
+ puts out.join(",")
+ end
+end
+
diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R
new file mode 100755
index 0000000..005dd3d
--- /dev/null
+++ b/scripts/pa-groups.R
@@ -0,0 +1,23 @@
+#!/usr/bin/env Rscript
+library(ggplot2)
+data <- read.csv("tables/pa-table.csv")
+for (i in c(2:10)) {
+ name <- names(data)[i]
+ group <- data[data[i] == 1,c(15,19,20,21,22)]
+ freq <- 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group))
+ plot <- ggplot(data.frame(freq),aes(x=c("lazar","LR-sgd","LR-scikit","NN","RF"),y=freq)) + geom_bar(stat="identity") + ylab("% mutagenic") + xlab(name) + ylim(c(0,100))
+ ggsave(paste("figures/",name,".png",sep=""))
+}
+#groups <- names(data)[c(2:10)]
+#labels <- data$Mutagenicity
+#data$Mutagenicity <- NULL
+#m <- as.matrix(data)
+#dist <- as.dist(m)
+#tsne <- Rtsne(dist,is_distance=T)
+#write.csv(tsne,"tsne.csv")
+#write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"tsne.csv")
+#tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2])
+#colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00")
+#plot <- ggplot(tsne_plot)
+#plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank()) + scale_color_manual(values = colors)
+#ggsave("figures/tsne-mp2d.png")
diff --git a/scripts/pa-table.rb b/scripts/pa-table.rb
index 8c1037e..1b8ecc8 100755
--- a/scripts/pa-table.rb
+++ b/scripts/pa-table.rb
@@ -2,7 +2,7 @@
# red groups
tab = []
-File.read("pyrrolizidine-alkaloids/R/PA.RF.outcome.csv").each_line do |l|
+File.read("data/pyrrolizidine-alkaloids/pa-groups.csv").each_line do |l|
items = l.chomp.split(';')
if items.first.empty?
items[0] = "ID"
@@ -17,7 +17,7 @@ end
tab[0] += ["CID","SMILES","Canonical SMILES","Measured","lazar-MP2D","lazar-MP2D-high-confidence","lazar-CDK","lazar-CDK-high-confidence"]
i = 0
-File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l|
+File.read("data/pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l|
if i > 0
id,cid,name,smi = l.chomp.split(";")
tab[i] += [cid,'"'+smi+'"']
@@ -26,7 +26,7 @@ File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do
end
i = 0
-File.read("pyrrolizidine-alkaloids/lazar/pa-mp2d-predictions.csv").each_line do |l|
+File.read("pyrrolizidine-alkaloids/mp2d/lazar/pa-mp2d-predictions.csv").each_line do |l|
if i > 0
id,cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",")
max_sim.to_f < 0.5? hc = "F" : hc = "T"
@@ -36,8 +36,8 @@ File.read("pyrrolizidine-alkaloids/lazar/pa-mp2d-predictions.csv").each_line do
i += 1
end
-i=0
-File.read("pyrrolizidine-alkaloids/lazar/pa-cdk-predictions.csv").each_line do |l|
+i=1
+File.read("pyrrolizidine-alkaloids/cdk/lazar/pa-cdk-predictions.csv").each_line do |l|
#if i > 0
cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",")
max_sim.to_f < 0.5? hc = "F" : hc = "T"
@@ -47,6 +47,7 @@ File.read("pyrrolizidine-alkaloids/lazar/pa-cdk-predictions.csv").each_line do |
i += 1
end
+=begin
Dir["pyrrolizidine-alkaloids/R/PA.*.outcome.csv"].each do |r|
tab[0] << "R-"+r.sub('pyrrolizidine-alkaloids/R/PA.','').sub('.outcome.csv','')
i = 0
@@ -63,9 +64,22 @@ Dir["pyrrolizidine-alkaloids/R/PA.*.outcome.csv"].each do |r|
i += 1
end
end
+=end
-Dir["pyrrolizidine-alkaloids/tensorflow/pred.*.v5-ext-Padel-2D.csv"].each do |r|
- tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN")
+Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r|
+ tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")
+ i = 0
+ File.read(r).each_line do |l|
+ if i > 0
+ id,pred = l.chomp.split(",")
+ pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0
+ end
+ i += 1
+ end
+end
+
+Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r|
+ tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")
i = 0
File.read(r).each_line do |l|
if i > 0
diff --git a/scripts/pa-tex-table.rb b/scripts/pa-tex-table.rb
index 0fe1410..b163ab3 100755
--- a/scripts/pa-tex-table.rb
+++ b/scripts/pa-tex-table.rb
@@ -13,16 +13,16 @@ puts '
\caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; 1: Retronecine, 2: Otonecine, 3: Platynecine, 4: N-oxide, 5: Dehydropyrrolizidine, 6:Tertiary PA, 7: Macrocyclic-diester, 8: Monoester, 9: Diester} \\\\
\label{tab:pa}
-1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & RF & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn
-\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{3}{c}{R} & \multicolumn{4}{c}{Tensorflow}\\\\
+1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn
+\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{4}{c}{Tensorflow}\\\\
-1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & RF & SVM & LR-sgd & LR-scikit & NN & RF \\\\
+1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\
\hline
\renewcommand{\arraystretch}{0.075}
'
File.read(ARGV[0]).each_line do |l|
unless l.match("SMILES")
- id,r,o,p,n,de,t,ma,mo,di,cid,smi,cansmi,exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_CDK,lazar_CDK_high_confidence,r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",")
+ id,r,o,p,n,de,t,ma,mo,di,cid,smi,cansmi,exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_CDK,lazar_CDK_high_confidence,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",")
row = [r,o,p,n,de,t,ma,mo,di].collect{|group| group == "1" ? '\cellcolor{black}' : '\cellcolor{white}'}.join(' & ')
if exp == "1"
row += ' & \cellcolor{red}'
@@ -49,7 +49,7 @@ File.read(ARGV[0]).each_line do |l|
else
row += ' & \cellcolor{grey}'
end
- [r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF].each do |mut|
+ [tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF].each do |mut|
mut == "1" ? row += ' & \cellcolor{red}' : row += ' & \cellcolor{green}'
end
puts row + ' \\\\'
diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb
index 557dbd4..8bc323c 100755
--- a/scripts/summary2table.rb
+++ b/scripts/summary2table.rb
@@ -5,21 +5,25 @@ rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "T
data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv]
case ARGV[0]
-when "R"
- header = ["RF","SVM","DL"]
- keys = header.collect{|h| "R-"+h}
when "tensorflow"
- header = ["RF","LR-sgd","LR-scikit","NN"]
- keys = ["rf","lr","lr2","nn"].collect{|n| "tensorflow-"+n+".v3"}
+ header = ["MP2D-RF","MP2D-LR-sgd","MP2D-LR-scikit","MP2D-NN","MP2D-SVM","CDK-RF","CDK-LR-sgd","CDK-LR-scikit","CDK-NN","CDK-SVM"]
+ desc = ["mp2d","cdk"]
+ algos = ["rf","lr","lr2","nn","svm"]
+ keys = []
+ desc.each do |d|
+ algos.each do |a|
+ keys << "tensorflow-"+a+"-"+d
+ end
+ end
when "lazar"
header = ["MP2D", "CDK"]
- mp2dkeys = ["lazar-all","lazar-high-confidence"]
- padelkeys = ["lazar-padel-all","lazar-padel-high-confidence"]
+ mp2dkeys = ["lazar-mp2d-all","lazar-mp2d-high-confidence"]
+ cdkkeys = ["lazar-cdk-all","lazar-cdk-high-confidence"]
puts ","+header.join(",")
rows.each do |short,long|
print long+","
print mp2dkeys.collect{|k| data[k][short]}.join("/")+","
- puts padelkeys.collect{|k| data[k][short]}.join("/")
+ puts cdkkeys.collect{|k| data[k][short]}.join("/")
end
exit
end