diff options
author | Christoph Helma <helma@in-silico.ch> | 2021-02-06 20:21:58 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2021-02-06 20:21:58 +0100 |
commit | 771a2381ae0fd5e352f23d7223baeb26e8bb4e02 (patch) | |
tree | 1a7182a83ec15b9d7a69e5508c8d3b2bfa5983df /scripts | |
parent | 241f997c3a1a6a38fa47070f5efbd23852cc432b (diff) |
svm validation, tensorflow mp2d pa predictions, r results removed, cleanup
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/cv-tensorflow-confusion-matrix.rb | 2 | ||||
-rwxr-xr-x | scripts/mp2d-distances.rb | 2 | ||||
-rwxr-xr-x | scripts/pa-fingerprints.rb | 20 | ||||
-rwxr-xr-x | scripts/pa-groups.R | 23 | ||||
-rwxr-xr-x | scripts/pa-table.rb | 28 | ||||
-rwxr-xr-x | scripts/pa-tex-table.rb | 10 | ||||
-rwxr-xr-x | scripts/summary2table.rb | 20 |
7 files changed, 83 insertions, 22 deletions
diff --git a/scripts/cv-tensorflow-confusion-matrix.rb b/scripts/cv-tensorflow-confusion-matrix.rb index fba5c49..087d905 100755 --- a/scripts/cv-tensorflow-confusion-matrix.rb +++ b/scripts/cv-tensorflow-confusion-matrix.rb @@ -7,7 +7,7 @@ tn = 0 fn = 0 pred = CSV.read(ARGV[0],headers: true,:col_sep => ",") -act = CSV.read(File.join("data","mutagenicity.csv"),headers: true,:col_sep => ",") +act = CSV.read(File.join("data","training","mutagenicity.csv"),headers: true,:col_sep => ",") data = {} diff --git a/scripts/mp2d-distances.rb b/scripts/mp2d-distances.rb index 398504d..05313c2 100755 --- a/scripts/mp2d-distances.rb +++ b/scripts/mp2d-distances.rb @@ -17,7 +17,7 @@ File.readlines(File.join("..","lazar","models","mutagenicity","independent_varia end end -File.readlines(File.join("pyrrolizidine-alkaloids","PA-smiles.csv")).each_with_index do |line,i| +File.readlines(File.join("pyrrolizidine-alkaloids","lazar","pa-smiles.csv")).each_with_index do |line,i| if i > 0 (id,smiles) = line.chomp.split(",") independent_variables << Compound.new(smiles).fingerprint diff --git a/scripts/pa-fingerprints.rb b/scripts/pa-fingerprints.rb new file mode 100755 index 0000000..344ba86 --- /dev/null +++ b/scripts/pa-fingerprints.rb @@ -0,0 +1,20 @@ +#!/usr/bin/env ruby +require_relative "../../lazar/lib/lazar.rb" +training_fingerprints = `sed -n '1p' data/mutagenicity-fingerprints.csv`.chomp.split(",") +training_fingerprints.pop +puts training_fingerprints.join(",") +training_fingerprints.shift + +File.readlines(File.join("pyrrolizidine-alkaloids","lazar","pa-smiles.csv")).each_with_index do |line,i| + if i > 0 + (id,smiles) = line.chomp.split(",") + c = Compound.new(smiles) + out = [c.smiles] + fp = c.fingerprint + training_fingerprints.each do |frag| + fp.include?(frag) ? out << 1 : out << 0 + end + puts out.join(",") + end +end + diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R new file mode 100755 index 0000000..005dd3d --- /dev/null +++ b/scripts/pa-groups.R @@ -0,0 +1,23 @@ +#!/usr/bin/env Rscript +library(ggplot2) +data <- read.csv("tables/pa-table.csv") +for (i in c(2:10)) { + name <- names(data)[i] + group <- data[data[i] == 1,c(15,19,20,21,22)] + freq <- 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group)) + plot <- ggplot(data.frame(freq),aes(x=c("lazar","LR-sgd","LR-scikit","NN","RF"),y=freq)) + geom_bar(stat="identity") + ylab("% mutagenic") + xlab(name) + ylim(c(0,100)) + ggsave(paste("figures/",name,".png",sep="")) +} +#groups <- names(data)[c(2:10)] +#labels <- data$Mutagenicity +#data$Mutagenicity <- NULL +#m <- as.matrix(data) +#dist <- as.dist(m) +#tsne <- Rtsne(dist,is_distance=T) +#write.csv(tsne,"tsne.csv") +#write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"tsne.csv") +#tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2]) +#colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00") +#plot <- ggplot(tsne_plot) +#plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank()) + scale_color_manual(values = colors) +#ggsave("figures/tsne-mp2d.png") diff --git a/scripts/pa-table.rb b/scripts/pa-table.rb index 8c1037e..1b8ecc8 100755 --- a/scripts/pa-table.rb +++ b/scripts/pa-table.rb @@ -2,7 +2,7 @@ # red groups tab = [] -File.read("pyrrolizidine-alkaloids/R/PA.RF.outcome.csv").each_line do |l| +File.read("data/pyrrolizidine-alkaloids/pa-groups.csv").each_line do |l| items = l.chomp.split(';') if items.first.empty? items[0] = "ID" @@ -17,7 +17,7 @@ end tab[0] += ["CID","SMILES","Canonical SMILES","Measured","lazar-MP2D","lazar-MP2D-high-confidence","lazar-CDK","lazar-CDK-high-confidence"] i = 0 -File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l| +File.read("data/pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l| if i > 0 id,cid,name,smi = l.chomp.split(";") tab[i] += [cid,'"'+smi+'"'] @@ -26,7 +26,7 @@ File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do end i = 0 -File.read("pyrrolizidine-alkaloids/lazar/pa-mp2d-predictions.csv").each_line do |l| +File.read("pyrrolizidine-alkaloids/mp2d/lazar/pa-mp2d-predictions.csv").each_line do |l| if i > 0 id,cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",") max_sim.to_f < 0.5? hc = "F" : hc = "T" @@ -36,8 +36,8 @@ File.read("pyrrolizidine-alkaloids/lazar/pa-mp2d-predictions.csv").each_line do i += 1 end -i=0 -File.read("pyrrolizidine-alkaloids/lazar/pa-cdk-predictions.csv").each_line do |l| +i=1 +File.read("pyrrolizidine-alkaloids/cdk/lazar/pa-cdk-predictions.csv").each_line do |l| #if i > 0 cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",") max_sim.to_f < 0.5? hc = "F" : hc = "T" @@ -47,6 +47,7 @@ File.read("pyrrolizidine-alkaloids/lazar/pa-cdk-predictions.csv").each_line do | i += 1 end +=begin Dir["pyrrolizidine-alkaloids/R/PA.*.outcome.csv"].each do |r| tab[0] << "R-"+r.sub('pyrrolizidine-alkaloids/R/PA.','').sub('.outcome.csv','') i = 0 @@ -63,9 +64,22 @@ Dir["pyrrolizidine-alkaloids/R/PA.*.outcome.csv"].each do |r| i += 1 end end +=end -Dir["pyrrolizidine-alkaloids/tensorflow/pred.*.v5-ext-Padel-2D.csv"].each do |r| - tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN") +Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r| + tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM") + i = 0 + File.read(r).each_line do |l| + if i > 0 + id,pred = l.chomp.split(",") + pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0 + end + i += 1 + end +end + +Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r| + tab[0] << "TF-"+r.sub('pyrrolizidine-alkaloids/tensorflow/pred.','').sub('.v5-ext-Padel-2D.csv','').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM") i = 0 File.read(r).each_line do |l| if i > 0 diff --git a/scripts/pa-tex-table.rb b/scripts/pa-tex-table.rb index 0fe1410..b163ab3 100755 --- a/scripts/pa-tex-table.rb +++ b/scripts/pa-tex-table.rb @@ -13,16 +13,16 @@ puts ' \caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; 1: Retronecine, 2: Otonecine, 3: Platynecine, 4: N-oxide, 5: Dehydropyrrolizidine, 6:Tertiary PA, 7: Macrocyclic-diester, 8: Monoester, 9: Diester} \\\\ \label{tab:pa} -1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & RF & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn -\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{3}{c}{R} & \multicolumn{4}{c}{Tensorflow}\\\\ +1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn +\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{4}{c}{Tensorflow}\\\\ -1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & RF & SVM & LR-sgd & LR-scikit & NN & RF \\\\ +1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\ \hline \renewcommand{\arraystretch}{0.075} ' File.read(ARGV[0]).each_line do |l| unless l.match("SMILES") - id,r,o,p,n,de,t,ma,mo,di,cid,smi,cansmi,exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_CDK,lazar_CDK_high_confidence,r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",") + id,r,o,p,n,de,t,ma,mo,di,cid,smi,cansmi,exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_CDK,lazar_CDK_high_confidence,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",") row = [r,o,p,n,de,t,ma,mo,di].collect{|group| group == "1" ? '\cellcolor{black}' : '\cellcolor{white}'}.join(' & ') if exp == "1" row += ' & \cellcolor{red}' @@ -49,7 +49,7 @@ File.read(ARGV[0]).each_line do |l| else row += ' & \cellcolor{grey}' end - [r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF].each do |mut| + [tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF].each do |mut| mut == "1" ? row += ' & \cellcolor{red}' : row += ' & \cellcolor{green}' end puts row + ' \\\\' diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb index 557dbd4..8bc323c 100755 --- a/scripts/summary2table.rb +++ b/scripts/summary2table.rb @@ -5,21 +5,25 @@ rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "T data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv] case ARGV[0] -when "R" - header = ["RF","SVM","DL"] - keys = header.collect{|h| "R-"+h} when "tensorflow" - header = ["RF","LR-sgd","LR-scikit","NN"] - keys = ["rf","lr","lr2","nn"].collect{|n| "tensorflow-"+n+".v3"} + header = ["MP2D-RF","MP2D-LR-sgd","MP2D-LR-scikit","MP2D-NN","MP2D-SVM","CDK-RF","CDK-LR-sgd","CDK-LR-scikit","CDK-NN","CDK-SVM"] + desc = ["mp2d","cdk"] + algos = ["rf","lr","lr2","nn","svm"] + keys = [] + desc.each do |d| + algos.each do |a| + keys << "tensorflow-"+a+"-"+d + end + end when "lazar" header = ["MP2D", "CDK"] - mp2dkeys = ["lazar-all","lazar-high-confidence"] - padelkeys = ["lazar-padel-all","lazar-padel-high-confidence"] + mp2dkeys = ["lazar-mp2d-all","lazar-mp2d-high-confidence"] + cdkkeys = ["lazar-cdk-all","lazar-cdk-high-confidence"] puts ","+header.join(",") rows.each do |short,long| print long+"," print mp2dkeys.collect{|k| data[k][short]}.join("/")+"," - puts padelkeys.collect{|k| data[k][short]}.join("/") + puts cdkkeys.collect{|k| data[k][short]}.join("/") end exit end |