diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/cv-pa-classifications.rb | 26 | ||||
-rwxr-xr-x | scripts/lazar-cv-predictions.rb | 33 | ||||
-rwxr-xr-x | scripts/lazar-pa-predictions.rb | 3 | ||||
-rwxr-xr-x | scripts/pa-groups.R | 60 | ||||
-rwxr-xr-x | scripts/pa-groups.rb | 41 | ||||
-rwxr-xr-x | scripts/pa-predictions.rb | 2 | ||||
-rwxr-xr-x | scripts/roc.R | 65 | ||||
-rwxr-xr-x | scripts/roc.rb | 5 | ||||
-rwxr-xr-x | scripts/summary2table.rb | 34 | ||||
-rwxr-xr-x[-rw-r--r--] | scripts/tsne-cdk-coordinates.R | 0 | ||||
-rwxr-xr-x | scripts/tsne-classifications.R | 19 | ||||
-rwxr-xr-x[-rw-r--r--] | scripts/tsne-mp2d-coordinates.R | 0 |
12 files changed, 159 insertions, 129 deletions
diff --git a/scripts/cv-pa-classifications.rb b/scripts/cv-pa-classifications.rb new file mode 100755 index 0000000..fe64078 --- /dev/null +++ b/scripts/cv-pa-classifications.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby + +classifications = {} +File.readlines(ARGV[1]).each do |l| + smi,c = l.chomp.split(",") + classifications[smi] = c +end +lines = File.readlines(ARGV[2]) +lines.shift +lines.each do |l| + smi,c = l.chomp.split(",") + if c == "1" + classifications[smi] = "PA-mutagenic" + elsif c == "0" + classifications[smi] = "PA-nonmutagenic" + else + classifications[smi] = "PA-NA" + end +end +lines=File.readlines(ARGV[0]) +lines.shift +lines.each do |l| + smi,x,y = l.split(",") + smi.gsub!('"','') + classifications[smi] ? puts(classifications[smi]) : puts("NA") +end diff --git a/scripts/lazar-cv-predictions.rb b/scripts/lazar-cv-predictions.rb index 9236bec..9f94776 100755 --- a/scripts/lazar-cv-predictions.rb +++ b/scripts/lazar-cv-predictions.rb @@ -1,31 +1,8 @@ #!/usr/bin/env ruby -predictions = {} -minsim = ARGV[1].to_f -minsim ||= 0 -Dir[File.join(ARGV[0],"crossvalidation","*","test","predictions")].each do |pred| - File.readlines(pred).each_with_index do |l,i| - smi,m,pred,pa,pi,maxsim,n = l.split(",") - predictions[smi] = [pred,maxsim.to_f] - end -end -lines = File.readlines(File.join("mutagenicity","mutagenicity.csv")) -lines.shift -lines.each do |line| - smi,exp = line.chomp.split(",") - if predictions[smi] - if predictions[smi].first == "1" and exp == "1" and predictions[smi].last >= minsim - puts [smi,"TP"].join(",") - elsif predictions[smi].first == "0" and exp == "0" and predictions[smi].last >= minsim - puts [smi,"TN"].join(",") - elsif predictions[smi].first == "1" and exp == "0" and predictions[smi].last >= minsim - puts [smi,"FP"].join(",") - elsif predictions[smi].first == "0" and exp == "1" and predictions[smi].last >= minsim - puts [smi,"FN"].join(",") - else - puts [smi,"NA"].join(",") - end - else - puts [smi,"NA"].join(",") - end +thresh = ARGV[1].to_f +thresh ||= 0 +File.readlines(ARGV[0]).each do |pred| + smi,c,maxsim = pred.split(",") + puts [smi,c].join(",") if maxsim.to_f > thresh end diff --git a/scripts/lazar-pa-predictions.rb b/scripts/lazar-pa-predictions.rb index 3fb61da..79c8198 100755 --- a/scripts/lazar-pa-predictions.rb +++ b/scripts/lazar-pa-predictions.rb @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -predictions = {} minsim = ARGV[1].to_f minsim ||= 0 -File.readlines(ARGV[0]).each do |l| +File.readlines(ARGV[0])[1..-1].each do |l| smi,m,pred,pa,pi,maxsim,n = l.split(",") pred = "NA" if maxsim.to_f < minsim puts [smi,pred].join(",") diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R index 3c6ce2c..77b358b 100755 --- a/scripts/pa-groups.R +++ b/scripts/pa-groups.R @@ -1,33 +1,49 @@ #!/usr/bin/env Rscript library(ggplot2) args = commandArgs(trailingOnly=TRUE) -groups = read.csv(args[1],header=T) -data = read.csv(args[2]) -for (i in c(2:10)) { - name = names(groups)[i] - cols = c(2:15) - group = data[groups[i] == 1,cols] - freq = 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group)) - algos = toupper(names(data)[cols]) - algos = gsub("HIGH",'HC',algos) - algos = gsub(".CONFIDENCE",'',algos) - algos = gsub("\\.",'-',algos) - algos <- factor(algos,levels=rev(c( - "MP2D-LAZAR-ALL", +freq = read.csv(args[1],header=T,quote="'",sep=",") + +models = factor(freq$Model,levels=rev(c( "MP2D-LAZAR-HC", + "MP2D-LAZAR-ALL", "MP2D-RF", - "MP2D-LR", - "MP2D-LR2", + "MP2D-LR-sgd", + "MP2D-LR-scikit", "MP2D-NN", "MP2D-SVM", - "CDK-LAZAR-ALL", "CDK-LAZAR-HC", + "CDK-LAZAR-ALL", "CDK-RF", - "CDK-LR", - "CDK-LR2", + "CDK-LR-sgd", + "CDK-LR-scikit", "CDK-NN", "CDK-SVM" - ))) - plot = ggplot(data.frame(freq),aes(x=freq,y=algos)) + geom_bar(stat="identity") + xlab("% mutagenic") + ylab(element_blank()) + xlim(c(0,100))# + theme(axis.text.x = element_text(angle=90)) - ggsave(paste("figures/",name,".png",sep="")) -} +))) + +colors <- c( +"MP2D-LAZAR-HC" = "#0072B2", +"MP2D-LAZAR-ALL" = "#56B4E9", +"MP2D-RF" = "#009E73", +"MP2D-LR-sgd" = "#F0E442", +"MP2D-LR-scikit" = "#D55E00", +"MP2D-NN" = "#CC79A7", +"MP2D-SVM" = "#E69F00", +"CDK-LAZAR-HC" = "#0072B2", +"CDK-LAZAR-ALL" = "#56B4E9", +"CDK-RF" = "#009E73", +"CDK-LR-sgd" = "#F0E442", +"CDK-LR-scikit" = "#D55E00", +"CDK-NN" = "#CC79A7", +"CDK-SVM" = "#E69F00" +) + +ggplot(freq,aes(Frequency,models,fill=models)) + + geom_bar(stat="identity",show.legend=F) + + xlab("% mutagenic") + + ylab(element_blank()) + + xlim(c(0,100)) + + scale_fill_manual(values = colors) + + facet_wrap(~PA.Group) + + theme_minimal() + + +ggsave(args[2]) diff --git a/scripts/pa-groups.rb b/scripts/pa-groups.rb new file mode 100755 index 0000000..b183116 --- /dev/null +++ b/scripts/pa-groups.rb @@ -0,0 +1,41 @@ +#!/usr/bin/env ruby +group_data = File.readlines(ARGV[0]).collect{|l| l.chomp.split(",")} +predictions = File.readlines(ARGV[1]).collect{|l| l.chomp.split(",")} +group_names = group_data.shift +group_names.shift +algo_names = predictions.shift +algo_names.shift +groups = {} +group_data.each do |d| + smi = d.shift + groups[smi] ||= [] + d.each_with_index do |v,i| + groups[smi] << group_names[i] if v == "1" + end +end +mut = {} +n = {} +predictions.each do |pred| + smi = pred.shift + pred.each_with_index do |p,i| + algo = algo_names[i] + group_names.each do |g| + mut[g] ||= {} + n[g] ||= {} + mut[g][algo] ||= 0 + n[g][algo] ||= 0 + if groups[smi].include? g + n[g][algo]+=1 + mut[g][algo]+=1 if p == "1" + end + end + end +end + +q = "'" +puts ["'PA Group'","'Model'","'Frequency'"].join(",") +mut.each do |g,val| + val.each do |a,n_mut| + puts [q+g+q,q+a.sub("high-confidence","HC").upcase.sub(/-LR$/,"-LR-sgd").sub("LR2","LR-scikit")+q,100.0*n_mut/n[g][a]].join(",") + end +end diff --git a/scripts/pa-predictions.rb b/scripts/pa-predictions.rb index 9500c39..712017b 100755 --- a/scripts/pa-predictions.rb +++ b/scripts/pa-predictions.rb @@ -8,6 +8,8 @@ ARGV.each do |f| algos << name File.readlines(f).each do |l| smi,pred = l.chomp.split(",") + pred ||= "NA" + pred = nil if pred == "NA" predictions[smi] ||= {} predictions[smi][name] = pred end diff --git a/scripts/roc.R b/scripts/roc.R index 32e1674..cbd5ea6 100755 --- a/scripts/roc.R +++ b/scripts/roc.R @@ -1,45 +1,28 @@ #!/usr/bin/env Rscript library(ggplot2) -data <- read.csv("figures/roc.csv",header=T) -labels = factor(row.names(data), levels = c("MP2D-LAZAR-HC", "MP2D-LAZAR-ALL", "MP2D-RF", "MP2D-LR", "MP2D-LR2", "MP2D-NN", "MP2D-SVM", "CDK-LAZAR-HC", "CDK-LAZAR-ALL", "CDK-RF", "CDK-LR", "CDK-LR2", "CDK-NN", "CDK-SVM")) -shapes = c( -"MP2D-LAZAR-HC" = 16, -"MP2D-LAZAR-ALL" = 16, -"MP2D-RF" = 16, -"MP2D-LR" = 16, -"MP2D-LR2" = 16, -"MP2D-NN" = 16, -"MP2D-SVM" = 16, -"CDK-LAZAR-HC" = 17, -"CDK-LAZAR-ALL" = 17, -"CDK-RF" = 17, -"CDK-LR" = 17, -"CDK-LR2" = 17, -"CDK-NN" = 17, -"CDK-SVM" = 17) +args = commandArgs(trailingOnly=TRUE) -colors <- c( -"MP2D-LAZAR-HC" = "#E69F00", -"MP2D-LAZAR-ALL" = "#56B4E9", -"MP2D-RF" = "#009E73", -"MP2D-LR" = "#F0E442", -"MP2D-LR2" = "#0072B2", -"MP2D-NN" = "#D55E00", -"MP2D-SVM" = "#CC79A7", -"CDK-LAZAR-HC" = "#E69F00", -"CDK-LAZAR-ALL" = "#56B4E9", -"CDK-RF" = "#009E73", -"CDK-LR" = "#F0E442", -"CDK-LR2" = "#0072B2", -"CDK-NN" = "#D55E00", -"CDK-SVM" = "#CC79A7") +data = read.csv(args[1],header=T) +model_labels = factor(data$model, levels = c("LAZAR-HC", "LAZAR-ALL", "RF", "LR-sgd", "LR-scikit", "NN", "SVM")) +descriptor_labels = factor(data$descriptor, levels = c("MP2D", "CDK")) -p <- ggplot(data) -p <- p + geom_point(aes(x=fpr, y=tpr, color = labels, shape = labels)) -p <- p + geom_abline() -p <- p + theme(legend.title=element_blank()) -p <- p + expand_limits(x=c(0,1),y=c(0,1)) -p <- p + scale_shape_manual(values = shapes) -p <- p + scale_color_manual(values = colors) -p <- p + labs(x = "False positive rate", y = "True positive rate") -ggsave("figures/roc.png") +colors = c( +"LAZAR-HC" = "#0072B2", +"LAZAR-ALL" = "#56B4E9", +"RF" = "#009E73", +"LR-sgd" = "#F0E442", +"LR-scikit" = "#D55E00", +"NN" = "#CC79A7", +"SVM" = "#E69F00" +) + +ggplot(data,aes(fpr, tpr, color = model_labels, shape = descriptor_labels)) + + geom_point(size = 2.5) + + geom_abline() + + expand_limits(x=c(0,1),y=c(0,1)) + + labs(x = "False positive rate", y = "True positive rate") + + scale_color_manual(values = colors) + + theme_minimal() + + theme(legend.title=element_blank())#,legend.position = "bottom",legend.direction="vertical")#,legend.key.height = 7,legend.key.width=2) + + +ggsave(args[2]) diff --git a/scripts/roc.rb b/scripts/roc.rb index 8e2dc51..8c8c93c 100755 --- a/scripts/roc.rb +++ b/scripts/roc.rb @@ -2,7 +2,8 @@ require "yaml" data = YAML.load(File.read ARGV[0])[:cv] -puts "tpr,fpr" +puts "descriptor,model,tpr,fpr" data.each do |algo,values| - puts [algo.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC"),values[:tpr],values[:fpr]].join(",") + desc,model = algo.split("_",2) + puts [desc.upcase,model.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC").sub(/^LR$/,"LR-sgd").sub("LR2","LR-scikit"),values[:tpr],values[:fpr]].join(",") end diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb deleted file mode 100755 index d0da0af..0000000 --- a/scripts/summary2table.rb +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env ruby -require 'yaml' - -rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"} -data = YAML.load_file("crossvalidations/summary.yaml")[:cv] - -case ARGV[0] -when "tensorflow" - header = ["MP2D-RF","MP2D-LR-sgd","MP2D-LR-scikit","MP2D-NN","MP2D-SVM","CDK-RF","CDK-LR-sgd","CDK-LR-scikit","CDK-NN","CDK-SVM"] - desc = ["mp2d","cdk"] - algos = ["rf","lr","lr2","nn","svm"] - keys = [] - desc.each do |d| - algos.each do |a| - keys << "tensorflow-"+a+"-"+d - end - end -when "lazar" - header = ["MP2D", "CDK"] - mp2dkeys = ["lazar-mp2d-all","lazar-mp2d-high-confidence"] - cdkkeys = ["lazar-cdk-all","lazar-cdk-high-confidence"] - puts ","+header.join(",") - rows.each do |short,long| - print long+"," - print mp2dkeys.collect{|k| data[k][short]}.join("/")+"," - puts cdkkeys.collect{|k| data[k][short]}.join("/") - end - exit -end -puts ","+header.join(",") -rows.each do |short,long| - print long+"," - puts keys.collect{|k| data[k][short]}.join(",") -end diff --git a/scripts/tsne-cdk-coordinates.R b/scripts/tsne-cdk-coordinates.R index f9baff9..f9baff9 100644..100755 --- a/scripts/tsne-cdk-coordinates.R +++ b/scripts/tsne-cdk-coordinates.R diff --git a/scripts/tsne-classifications.R b/scripts/tsne-classifications.R new file mode 100755 index 0000000..6f37d11 --- /dev/null +++ b/scripts/tsne-classifications.R @@ -0,0 +1,19 @@ +#!/usr/bin/env Rscript +library(Rtsne) +library(ggplot2) +args = commandArgs(trailingOnly=TRUE) +tsne = read.csv(args[1],header=T) +tsne[,1] = NULL +labels = read.csv(args[2],header=F, na.strings = "-")[,1] +labels = factor(labels,levels=c("TP","TN","FN","FP","NA","PA-mutagenic","PA-nonmutagenic","PA-NA")) +tsne_plot = data.frame(x = tsne$x, y = tsne$y) +blue = "#00BFC4" +red = "#F8766D" +green = "#7CAE00" +grey = "#AAAAAA" +colors = c("PA-mutagenic" = "red", "PA-nonmutagenic" = "green", "PA-NA" = grey, "TP" = red, "TN" = green, "FP" = "red", "FN" = "green", "NA" = grey) +shapes = c("PA-mutagenic" = 22, "PA-nonmutagenic" = 22, "PA-NA" = 22,"TP" = 16, "TN" = 16, "FP" = 23, "FN" = 23, "NA" = 16) +fills = c("PA-mutagenic" = blue, "PA-nonmutagenic" = blue, "PA-NA" = blue, "TP" = red, "TN" = green, "FP" = "green", "FN" = "red", "NA" = grey) +plot = ggplot(tsne_plot) +plot + geom_point(aes(x=x, y=y, color = labels, shape = labels, fill = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_shape_manual(values = shapes) + scale_color_manual(values = colors) + scale_fill_manual(values = fills) +ggsave(args[3]) diff --git a/scripts/tsne-mp2d-coordinates.R b/scripts/tsne-mp2d-coordinates.R index ef97595..ef97595 100644..100755 --- a/scripts/tsne-mp2d-coordinates.R +++ b/scripts/tsne-mp2d-coordinates.R |