summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/cv-pa-classifications.rb 26
-rwxr-xr-x scripts/lazar-cv-predictions.rb 33
-rwxr-xr-x scripts/lazar-pa-predictions.rb 3
-rwxr-xr-x scripts/pa-groups.R 60
-rwxr-xr-x scripts/pa-groups.rb 41
-rwxr-xr-x scripts/pa-predictions.rb 2
-rwxr-xr-x scripts/roc.R 65
-rwxr-xr-x scripts/roc.rb 5
-rwxr-xr-x scripts/summary2table.rb 34
-rwxr-xr-x [-rw-r--r--] scripts/tsne-cdk-coordinates.R 0
-rwxr-xr-x scripts/tsne-classifications.R 19
-rwxr-xr-x [-rw-r--r--] scripts/tsne-mp2d-coordinates.R 0
12 files changed, 159 insertions, 129 deletions
diff --git a/scripts/cv-pa-classifications.rb b/scripts/cv-pa-classifications.rb
new file mode 100755
index 0000000..fe64078
--- /dev/null
+++ b/scripts/cv-pa-classifications.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+
+classifications = {}
+File.readlines(ARGV[1]).each do |l|
+ smi,c = l.chomp.split(",")
+ classifications[smi] = c
+end
+lines = File.readlines(ARGV[2])
+lines.shift
+lines.each do |l|
+ smi,c = l.chomp.split(",")
+ if c == "1"
+ classifications[smi] = "PA-mutagenic"
+ elsif c == "0"
+ classifications[smi] = "PA-nonmutagenic"
+ else
+ classifications[smi] = "PA-NA"
+ end
+end
+lines=File.readlines(ARGV[0])
+lines.shift
+lines.each do |l|
+ smi,x,y = l.split(",")
+ smi.gsub!('"','')
+ classifications[smi] ? puts(classifications[smi]) : puts("NA")
+end
diff --git a/scripts/lazar-cv-predictions.rb b/scripts/lazar-cv-predictions.rb
index 9236bec..9f94776 100755
--- a/scripts/lazar-cv-predictions.rb
+++ b/scripts/lazar-cv-predictions.rb
@@ -1,31 +1,8 @@
#!/usr/bin/env ruby
-predictions = {}
-minsim = ARGV[1].to_f
-minsim ||= 0
-Dir[File.join(ARGV[0],"crossvalidation","*","test","predictions")].each do |pred|
- File.readlines(pred).each_with_index do |l,i|
- smi,m,pred,pa,pi,maxsim,n = l.split(",")
- predictions[smi] = [pred,maxsim.to_f]
- end
-end
-lines = File.readlines(File.join("mutagenicity","mutagenicity.csv"))
-lines.shift
-lines.each do |line|
- smi,exp = line.chomp.split(",")
- if predictions[smi]
- if predictions[smi].first == "1" and exp == "1" and predictions[smi].last >= minsim
- puts [smi,"TP"].join(",")
- elsif predictions[smi].first == "0" and exp == "0" and predictions[smi].last >= minsim
- puts [smi,"TN"].join(",")
- elsif predictions[smi].first == "1" and exp == "0" and predictions[smi].last >= minsim
- puts [smi,"FP"].join(",")
- elsif predictions[smi].first == "0" and exp == "1" and predictions[smi].last >= minsim
- puts [smi,"FN"].join(",")
- else
- puts [smi,"NA"].join(",")
- end
- else
- puts [smi,"NA"].join(",")
- end
+thresh = ARGV[1].to_f
+thresh ||= 0
+File.readlines(ARGV[0]).each do |pred|
+ smi,c,maxsim = pred.split(",")
+ puts [smi,c].join(",") if maxsim.to_f > thresh
end
diff --git a/scripts/lazar-pa-predictions.rb b/scripts/lazar-pa-predictions.rb
index 3fb61da..79c8198 100755
--- a/scripts/lazar-pa-predictions.rb
+++ b/scripts/lazar-pa-predictions.rb
@@ -1,9 +1,8 @@
#!/usr/bin/env ruby
-predictions = {}
minsim = ARGV[1].to_f
minsim ||= 0
-File.readlines(ARGV[0]).each do |l|
+File.readlines(ARGV[0])[1..-1].each do |l|
smi,m,pred,pa,pi,maxsim,n = l.split(",")
pred = "NA" if maxsim.to_f < minsim
puts [smi,pred].join(",")
diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R
index 3c6ce2c..77b358b 100755
--- a/scripts/pa-groups.R
+++ b/scripts/pa-groups.R
@@ -1,33 +1,49 @@
#!/usr/bin/env Rscript
library(ggplot2)
args = commandArgs(trailingOnly=TRUE)
-groups = read.csv(args[1],header=T)
-data = read.csv(args[2])
-for (i in c(2:10)) {
- name = names(groups)[i]
- cols = c(2:15)
- group = data[groups[i] == 1,cols]
- freq = 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group))
- algos = toupper(names(data)[cols])
- algos = gsub("HIGH",'HC',algos)
- algos = gsub(".CONFIDENCE",'',algos)
- algos = gsub("\\.",'-',algos)
- algos <- factor(algos,levels=rev(c(
- "MP2D-LAZAR-ALL",
+freq = read.csv(args[1],header=T,quote="'",sep=",")
+
+models = factor(freq$Model,levels=rev(c(
"MP2D-LAZAR-HC",
+ "MP2D-LAZAR-ALL",
"MP2D-RF",
- "MP2D-LR",
- "MP2D-LR2",
+ "MP2D-LR-sgd",
+ "MP2D-LR-scikit",
"MP2D-NN",
"MP2D-SVM",
- "CDK-LAZAR-ALL",
"CDK-LAZAR-HC",
+ "CDK-LAZAR-ALL",
"CDK-RF",
- "CDK-LR",
- "CDK-LR2",
+ "CDK-LR-sgd",
+ "CDK-LR-scikit",
"CDK-NN",
"CDK-SVM"
- )))
- plot = ggplot(data.frame(freq),aes(x=freq,y=algos)) + geom_bar(stat="identity") + xlab("% mutagenic") + ylab(element_blank()) + xlim(c(0,100))# + theme(axis.text.x = element_text(angle=90))
- ggsave(paste("figures/",name,".png",sep=""))
-}
+)))
+
+colors <- c(
+"MP2D-LAZAR-HC" = "#0072B2",
+"MP2D-LAZAR-ALL" = "#56B4E9",
+"MP2D-RF" = "#009E73",
+"MP2D-LR-sgd" = "#F0E442",
+"MP2D-LR-scikit" = "#D55E00",
+"MP2D-NN" = "#CC79A7",
+"MP2D-SVM" = "#E69F00",
+"CDK-LAZAR-HC" = "#0072B2",
+"CDK-LAZAR-ALL" = "#56B4E9",
+"CDK-RF" = "#009E73",
+"CDK-LR-sgd" = "#F0E442",
+"CDK-LR-scikit" = "#D55E00",
+"CDK-NN" = "#CC79A7",
+"CDK-SVM" = "#E69F00"
+)
+
+ggplot(freq,aes(Frequency,models,fill=models)) +
+ geom_bar(stat="identity",show.legend=F) +
+ xlab("% mutagenic") +
+ ylab(element_blank()) +
+ xlim(c(0,100)) +
+ scale_fill_manual(values = colors) +
+ facet_wrap(~PA.Group) +
+ theme_minimal()
+# Chain ends above: a trailing "+" would add ggsave()'s return value to the plot (an error in recent ggplot2).
+ggsave(args[2])
diff --git a/scripts/pa-groups.rb b/scripts/pa-groups.rb
new file mode 100755
index 0000000..b183116
--- /dev/null
+++ b/scripts/pa-groups.rb
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+group_data = File.readlines(ARGV[0]).collect{|l| l.chomp.split(",")}
+predictions = File.readlines(ARGV[1]).collect{|l| l.chomp.split(",")}
+group_names = group_data.shift
+group_names.shift
+algo_names = predictions.shift
+algo_names.shift
+groups = {}
+group_data.each do |d|
+ smi = d.shift
+ groups[smi] ||= []
+ d.each_with_index do |v,i|
+ groups[smi] << group_names[i] if v == "1"
+ end
+end
+mut = {}
+n = {}
+predictions.each do |pred|
+ smi = pred.shift
+ pred.each_with_index do |p,i|
+ algo = algo_names[i]
+ group_names.each do |g|
+ mut[g] ||= {}
+ n[g] ||= {}
+ mut[g][algo] ||= 0
+ n[g][algo] ||= 0
+ if groups[smi].include? g
+ n[g][algo]+=1
+ mut[g][algo]+=1 if p == "1"
+ end
+ end
+ end
+end
+
+q = "'"
+puts ["'PA Group'","'Model'","'Frequency'"].join(",")
+mut.each do |g,val|
+ val.each do |a,n_mut|
+ puts [q+g+q,q+a.sub("high-confidence","HC").upcase.sub(/-LR$/,"-LR-sgd").sub("LR2","LR-scikit")+q,100.0*n_mut/n[g][a]].join(",")
+ end
+end
diff --git a/scripts/pa-predictions.rb b/scripts/pa-predictions.rb
index 9500c39..712017b 100755
--- a/scripts/pa-predictions.rb
+++ b/scripts/pa-predictions.rb
@@ -8,6 +8,8 @@ ARGV.each do |f|
algos << name
File.readlines(f).each do |l|
smi,pred = l.chomp.split(",")
+ pred ||= "NA"
+ pred = nil if pred == "NA"
predictions[smi] ||= {}
predictions[smi][name] = pred
end
diff --git a/scripts/roc.R b/scripts/roc.R
index 32e1674..cbd5ea6 100755
--- a/scripts/roc.R
+++ b/scripts/roc.R
@@ -1,45 +1,28 @@
#!/usr/bin/env Rscript
library(ggplot2)
-data <- read.csv("figures/roc.csv",header=T)
-labels = factor(row.names(data), levels = c("MP2D-LAZAR-HC", "MP2D-LAZAR-ALL", "MP2D-RF", "MP2D-LR", "MP2D-LR2", "MP2D-NN", "MP2D-SVM", "CDK-LAZAR-HC", "CDK-LAZAR-ALL", "CDK-RF", "CDK-LR", "CDK-LR2", "CDK-NN", "CDK-SVM"))
-shapes = c(
-"MP2D-LAZAR-HC" = 16,
-"MP2D-LAZAR-ALL" = 16,
-"MP2D-RF" = 16,
-"MP2D-LR" = 16,
-"MP2D-LR2" = 16,
-"MP2D-NN" = 16,
-"MP2D-SVM" = 16,
-"CDK-LAZAR-HC" = 17,
-"CDK-LAZAR-ALL" = 17,
-"CDK-RF" = 17,
-"CDK-LR" = 17,
-"CDK-LR2" = 17,
-"CDK-NN" = 17,
-"CDK-SVM" = 17)
+args = commandArgs(trailingOnly=TRUE)
-colors <- c(
-"MP2D-LAZAR-HC" = "#E69F00",
-"MP2D-LAZAR-ALL" = "#56B4E9",
-"MP2D-RF" = "#009E73",
-"MP2D-LR" = "#F0E442",
-"MP2D-LR2" = "#0072B2",
-"MP2D-NN" = "#D55E00",
-"MP2D-SVM" = "#CC79A7",
-"CDK-LAZAR-HC" = "#E69F00",
-"CDK-LAZAR-ALL" = "#56B4E9",
-"CDK-RF" = "#009E73",
-"CDK-LR" = "#F0E442",
-"CDK-LR2" = "#0072B2",
-"CDK-NN" = "#D55E00",
-"CDK-SVM" = "#CC79A7")
+data = read.csv(args[1],header=T)
+model_labels = factor(data$model, levels = c("LAZAR-HC", "LAZAR-ALL", "RF", "LR-sgd", "LR-scikit", "NN", "SVM"))
+descriptor_labels = factor(data$descriptor, levels = c("MP2D", "CDK"))
-p <- ggplot(data)
-p <- p + geom_point(aes(x=fpr, y=tpr, color = labels, shape = labels))
-p <- p + geom_abline()
-p <- p + theme(legend.title=element_blank())
-p <- p + expand_limits(x=c(0,1),y=c(0,1))
-p <- p + scale_shape_manual(values = shapes)
-p <- p + scale_color_manual(values = colors)
-p <- p + labs(x = "False positive rate", y = "True positive rate")
-ggsave("figures/roc.png")
+colors = c(
+"LAZAR-HC" = "#0072B2",
+"LAZAR-ALL" = "#56B4E9",
+"RF" = "#009E73",
+"LR-sgd" = "#F0E442",
+"LR-scikit" = "#D55E00",
+"NN" = "#CC79A7",
+"SVM" = "#E69F00"
+)
+
+ggplot(data,aes(fpr, tpr, color = model_labels, shape = descriptor_labels)) +
+ geom_point(size = 2.5) +
+ geom_abline() +
+ expand_limits(x=c(0,1),y=c(0,1)) +
+ labs(x = "False positive rate", y = "True positive rate") +
+ scale_color_manual(values = colors) +
+ theme_minimal() +
+ theme(legend.title=element_blank())#,legend.position = "bottom",legend.direction="vertical")#,legend.key.height = 7,legend.key.width=2) +
+
+ggsave(args[2])
diff --git a/scripts/roc.rb b/scripts/roc.rb
index 8e2dc51..8c8c93c 100755
--- a/scripts/roc.rb
+++ b/scripts/roc.rb
@@ -2,7 +2,8 @@
require "yaml"
data = YAML.load(File.read ARGV[0])[:cv]
-puts "tpr,fpr"
+puts "descriptor,model,tpr,fpr"
data.each do |algo,values|
- puts [algo.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC"),values[:tpr],values[:fpr]].join(",")
+ desc,model = algo.split("_",2)
+ puts [desc.upcase,model.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC").sub(/^LR$/,"LR-sgd").sub("LR2","LR-scikit"),values[:tpr],values[:fpr]].join(",")
end
diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb
deleted file mode 100755
index d0da0af..0000000
--- a/scripts/summary2table.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env ruby
-require 'yaml'
-
-rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"}
-data = YAML.load_file("crossvalidations/summary.yaml")[:cv]
-
-case ARGV[0]
-when "tensorflow"
- header = ["MP2D-RF","MP2D-LR-sgd","MP2D-LR-scikit","MP2D-NN","MP2D-SVM","CDK-RF","CDK-LR-sgd","CDK-LR-scikit","CDK-NN","CDK-SVM"]
- desc = ["mp2d","cdk"]
- algos = ["rf","lr","lr2","nn","svm"]
- keys = []
- desc.each do |d|
- algos.each do |a|
- keys << "tensorflow-"+a+"-"+d
- end
- end
-when "lazar"
- header = ["MP2D", "CDK"]
- mp2dkeys = ["lazar-mp2d-all","lazar-mp2d-high-confidence"]
- cdkkeys = ["lazar-cdk-all","lazar-cdk-high-confidence"]
- puts ","+header.join(",")
- rows.each do |short,long|
- print long+","
- print mp2dkeys.collect{|k| data[k][short]}.join("/")+","
- puts cdkkeys.collect{|k| data[k][short]}.join("/")
- end
- exit
-end
-puts ","+header.join(",")
-rows.each do |short,long|
- print long+","
- puts keys.collect{|k| data[k][short]}.join(",")
-end
diff --git a/scripts/tsne-cdk-coordinates.R b/scripts/tsne-cdk-coordinates.R
index f9baff9..f9baff9 100644..100755
--- a/scripts/tsne-cdk-coordinates.R
+++ b/scripts/tsne-cdk-coordinates.R
diff --git a/scripts/tsne-classifications.R b/scripts/tsne-classifications.R
new file mode 100755
index 0000000..6f37d11
--- /dev/null
+++ b/scripts/tsne-classifications.R
@@ -0,0 +1,19 @@
+#!/usr/bin/env Rscript
+library(Rtsne)
+library(ggplot2)
+args = commandArgs(trailingOnly=TRUE)
+tsne = read.csv(args[1],header=T)
+tsne[,1] = NULL
+labels = read.csv(args[2],header=F, na.strings = "-")[,1]
+labels = factor(labels,levels=c("TP","TN","FN","FP","NA","PA-mutagenic","PA-nonmutagenic","PA-NA"))
+tsne_plot = data.frame(x = tsne$x, y = tsne$y)
+blue = "#00BFC4"
+red = "#F8766D"
+green = "#7CAE00"
+grey = "#AAAAAA"
+colors = c("PA-mutagenic" = "red", "PA-nonmutagenic" = "green", "PA-NA" = grey, "TP" = red, "TN" = green, "FP" = "red", "FN" = "green", "NA" = grey)
+shapes = c("PA-mutagenic" = 22, "PA-nonmutagenic" = 22, "PA-NA" = 22,"TP" = 16, "TN" = 16, "FP" = 23, "FN" = 23, "NA" = 16)
+fills = c("PA-mutagenic" = blue, "PA-nonmutagenic" = blue, "PA-NA" = blue, "TP" = red, "TN" = green, "FP" = "green", "FN" = "red", "NA" = grey)
+plot = ggplot(tsne_plot)
+plot + geom_point(aes(x=x, y=y, color = labels, shape = labels, fill = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_shape_manual(values = shapes) + scale_color_manual(values = colors) + scale_fill_manual(values = fills)
+ggsave(args[3])
diff --git a/scripts/tsne-mp2d-coordinates.R b/scripts/tsne-mp2d-coordinates.R
index ef97595..ef97595 100644..100755
--- a/scripts/tsne-mp2d-coordinates.R
+++ b/scripts/tsne-mp2d-coordinates.R