12 files changed, 159 insertions, 129 deletions
diff --git a/scripts/cv-pa-classifications.rb b/scripts/cv-pa-classifications.rb
new file mode 100755
index 0000000..fe64078
--- /dev/null
+++ b/scripts/cv-pa-classifications.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+
+classifications = {}
+File.readlines(ARGV[1]).each do |l|
+  smi,c = l.chomp.split(",")
+  classifications[smi] = c
+end
+lines = File.readlines(ARGV[2])
+lines.shift
+lines.each do |l|
+  smi,c = l.chomp.split(",")
+  if c == "1"
+    classifications[smi] = "PA-mutagenic"
+  elsif c == "0"
+    classifications[smi] = "PA-nonmutagenic"
+  else
+    classifications[smi] = "PA-NA"
+  end
+end
+lines=File.readlines(ARGV[0])
+lines.shift
+lines.each do |l|
+  smi,x,y = l.split(",")
+  smi.gsub!('"','')
+  classifications[smi] ? puts(classifications[smi]) : puts("NA")
+end
diff --git a/scripts/lazar-cv-predictions.rb b/scripts/lazar-cv-predictions.rb
index 9236bec..9f94776 100755
--- a/scripts/lazar-cv-predictions.rb
+++ b/scripts/lazar-cv-predictions.rb
@@ -1,31 +1,8 @@
 #!/usr/bin/env ruby
 
-predictions = {}
-minsim = ARGV[1].to_f
-minsim ||= 0
-Dir[File.join(ARGV[0],"crossvalidation","*","test","predictions")].each do |pred|
-  File.readlines(pred).each_with_index do |l,i|
-    smi,m,pred,pa,pi,maxsim,n = l.split(",")
-    predictions[smi] = [pred,maxsim.to_f]
-  end
-end
-lines = File.readlines(File.join("mutagenicity","mutagenicity.csv"))
-lines.shift
-lines.each do |line|
-  smi,exp = line.chomp.split(",")
-  if predictions[smi]
-    if predictions[smi].first == "1" and exp == "1" and predictions[smi].last >= minsim
-      puts [smi,"TP"].join(",")
-    elsif predictions[smi].first == "0" and exp == "0" and predictions[smi].last >= minsim
-      puts [smi,"TN"].join(",")
-    elsif predictions[smi].first == "1" and exp == "0" and predictions[smi].last >= minsim
-      puts [smi,"FP"].join(",")
-    elsif predictions[smi].first == "0" and exp == "1" and predictions[smi].last >= minsim
-      puts [smi,"FN"].join(",")
-    else
-      puts [smi,"NA"].join(",")
-    end
-  else
-    puts [smi,"NA"].join(",")
-  end
+thresh = ARGV[1].to_f
+thresh ||= 0
+File.readlines(ARGV[0]).each do |pred|
+  smi,c,maxsim = pred.split(",")
+  puts [smi,c].join(",") if maxsim.to_f > thresh
 end
diff --git a/scripts/lazar-pa-predictions.rb b/scripts/lazar-pa-predictions.rb
index 3fb61da..79c8198 100755
--- a/scripts/lazar-pa-predictions.rb
+++ b/scripts/lazar-pa-predictions.rb
@@ -1,9 +1,8 @@
 #!/usr/bin/env ruby
 
-predictions = {}
 minsim = ARGV[1].to_f
 minsim ||= 0
-File.readlines(ARGV[0]).each do |l|
+File.readlines(ARGV[0])[1..-1].each do |l|
   smi,m,pred,pa,pi,maxsim,n = l.split(",")
   pred = "NA" if maxsim.to_f < minsim
   puts [smi,pred].join(",")
diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R
index 3c6ce2c..77b358b 100755
--- a/scripts/pa-groups.R
+++ b/scripts/pa-groups.R
@@ -1,33 +1,49 @@
 #!/usr/bin/env Rscript
 library(ggplot2)
 args = commandArgs(trailingOnly=TRUE)
-groups = read.csv(args[1],header=T)
-data = read.csv(args[2])
-for (i in c(2:10)) {
-  name = names(groups)[i]
-  cols = c(2:15)
-  group = data[groups[i] == 1,cols]
-  freq = 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group))
-  algos = toupper(names(data)[cols])
-  algos = gsub("HIGH",'HC',algos)
-  algos = gsub(".CONFIDENCE",'',algos)
-  algos = gsub("\\.",'-',algos)
-  algos <- factor(algos,levels=rev(c(
-  "MP2D-LAZAR-ALL",
+freq = read.csv(args[1],header=T,quote="'",sep=",") # input fields are quoted with single quotes (quote="'")
+
+models = factor(freq$Model,levels=rev(c( # explicit level order, reversed; presumably so the first name is drawn at the top of the y axis (confirm)
   "MP2D-LAZAR-HC",
+  "MP2D-LAZAR-ALL",
   "MP2D-RF",
-  "MP2D-LR",
-  "MP2D-LR2",
+  "MP2D-LR-sgd",
+  "MP2D-LR-scikit",
   "MP2D-NN",
   "MP2D-SVM",
-  "CDK-LAZAR-ALL",
   "CDK-LAZAR-HC",
+  "CDK-LAZAR-ALL",
   "CDK-RF",
-  "CDK-LR",
-  "CDK-LR2",
+  "CDK-LR-sgd",
+  "CDK-LR-scikit",
   "CDK-NN",
   "CDK-SVM"
-  )))
-  plot = ggplot(data.frame(freq),aes(x=freq,y=algos)) + geom_bar(stat="identity") + xlab("% mutagenic") + ylab(element_blank()) + xlim(c(0,100))# + theme(axis.text.x = element_text(angle=90))
-  ggsave(paste("figures/",name,".png",sep=""))
-}
+)))
+
+# One colour per model; both descriptor sets use the same colour for a model.
+model_colors <- c(
+  "LAZAR-HC"  = "#0072B2",
+  "LAZAR-ALL" = "#56B4E9",
+  "RF"        = "#009E73",
+  "LR-sgd"    = "#F0E442",
+  "LR-scikit" = "#D55E00",
+  "NN"        = "#CC79A7",
+  "SVM"       = "#E69F00"
+)
+# Expand to the full "<DESCRIPTOR>-<MODEL>" names, MP2D entries first.
+colors <- unlist(lapply(c("MP2D", "CDK"), function(descriptor) {
+  named <- model_colors
+  names(named) <- paste(descriptor, names(model_colors), sep = "-")
+  named
+}))
+
+# Bar chart of Frequency per model, one facet per PA group, written to args[2]. The plot is assigned and handed to ggsave() explicitly: a dangling `+` used to make ggsave() an operand of the plot expression.
+p <- ggplot(freq,aes(Frequency,models,fill=models)) +
+  geom_bar(stat="identity",show.legend=FALSE) +
+  xlab("% mutagenic") +
+  ylab(element_blank()) +
+  xlim(c(0,100)) +
+  scale_fill_manual(values = colors) +
+  facet_wrap(~PA.Group) +
+  theme_minimal()
+ggsave(args[2], plot = p)
diff --git a/scripts/pa-groups.rb b/scripts/pa-groups.rb
new file mode 100755
index 0000000..b183116
--- /dev/null
+++ b/scripts/pa-groups.rb
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+group_data = File.readlines(ARGV[0]).collect{|l| l.chomp.split(",")}
+predictions = File.readlines(ARGV[1]).collect{|l| l.chomp.split(",")}
+group_names = group_data.shift
+group_names.shift
+algo_names = predictions.shift
+algo_names.shift
+groups = {}
+group_data.each do |d|
+  smi = d.shift
+  groups[smi] ||= []
+  d.each_with_index do |v,i|
+    groups[smi] << group_names[i] if v == "1"
+  end
+end
+mut = {}
+n = {}
+predictions.each do |pred|
+  smi = pred.shift
+  pred.each_with_index do |p,i|
+    algo = algo_names[i]
+    group_names.each do |g|
+      mut[g] ||= {}
+      n[g] ||= {}
+      mut[g][algo] ||= 0
+      n[g][algo] ||= 0
+      if groups[smi].include? g
+        n[g][algo]+=1
+        mut[g][algo]+=1 if p == "1"
+      end
+    end
+  end
+end
+
+q = "'"
+puts ["'PA Group'","'Model'","'Frequency'"].join(",")
+mut.each do |g,val|
+  val.each do |a,n_mut|
+    puts [q+g+q,q+a.sub("high-confidence","HC").upcase.sub(/-LR$/,"-LR-sgd").sub("LR2","LR-scikit")+q,100.0*n_mut/n[g][a]].join(",")
+  end
+end
diff --git a/scripts/pa-predictions.rb b/scripts/pa-predictions.rb
index 9500c39..712017b 100755
--- a/scripts/pa-predictions.rb
+++ b/scripts/pa-predictions.rb
@@ -8,6 +8,8 @@ ARGV.each do |f|
   algos << name
   File.readlines(f).each do |l|
     smi,pred = l.chomp.split(",")
+    pred ||= "NA"
+    pred = nil if pred == "NA"
     predictions[smi] ||= {}
     predictions[smi][name] = pred
   end
diff --git a/scripts/roc.R b/scripts/roc.R
index 32e1674..cbd5ea6 100755
--- a/scripts/roc.R
+++ b/scripts/roc.R
@@ -1,45 +1,28 @@
 #!/usr/bin/env Rscript
 library(ggplot2)
-data <- read.csv("figures/roc.csv",header=T)
-labels = factor(row.names(data), levels = c("MP2D-LAZAR-HC", "MP2D-LAZAR-ALL", "MP2D-RF", "MP2D-LR", "MP2D-LR2", "MP2D-NN", "MP2D-SVM", "CDK-LAZAR-HC", "CDK-LAZAR-ALL", "CDK-RF", "CDK-LR", "CDK-LR2", "CDK-NN", "CDK-SVM"))
-shapes = c(
-"MP2D-LAZAR-HC" = 16,
-"MP2D-LAZAR-ALL" = 16,
-"MP2D-RF" = 16,
-"MP2D-LR" = 16,
-"MP2D-LR2" = 16,
-"MP2D-NN" = 16,
-"MP2D-SVM" = 16,
-"CDK-LAZAR-HC" = 17,
-"CDK-LAZAR-ALL" = 17,
-"CDK-RF" = 17,
-"CDK-LR" = 17,
-"CDK-LR2" = 17,
-"CDK-NN" = 17,
-"CDK-SVM" = 17)
+args = commandArgs(trailingOnly=TRUE)
 
-colors <- c(
-"MP2D-LAZAR-HC" =  "#E69F00",
-"MP2D-LAZAR-ALL" = "#56B4E9",
-"MP2D-RF" =        "#009E73",
-"MP2D-LR" =        "#F0E442",
-"MP2D-LR2" =       "#0072B2",
-"MP2D-NN" =        "#D55E00",
-"MP2D-SVM" =       "#CC79A7",
-"CDK-LAZAR-HC" =   "#E69F00",
-"CDK-LAZAR-ALL" =  "#56B4E9",
-"CDK-RF" =         "#009E73",
-"CDK-LR" =         "#F0E442",
-"CDK-LR2" =        "#0072B2",
-"CDK-NN" =         "#D55E00",
-"CDK-SVM" =        "#CC79A7")
+data <- read.csv(args[1], header = TRUE)  # columns read below: model, descriptor; the plot uses fpr and tpr
+model_labels <- factor(data$model, levels = c("LAZAR-HC", "LAZAR-ALL", "RF", "LR-sgd", "LR-scikit", "NN", "SVM"))
+descriptor_labels <- factor(data$descriptor, levels = c("MP2D", "CDK"))
 
-p <- ggplot(data)
-p <- p + geom_point(aes(x=fpr, y=tpr, color = labels, shape = labels))
-p <- p + geom_abline()
-p <- p + theme(legend.title=element_blank())
-p <- p + expand_limits(x=c(0,1),y=c(0,1))
-p <- p + scale_shape_manual(values = shapes)
-p <- p + scale_color_manual(values = colors)
-p <- p + labs(x = "False positive rate", y = "True positive rate")
-ggsave("figures/roc.png")
+# Model names and their hex colours, kept in the same order.
+model_names <- c(
+  "LAZAR-HC", "LAZAR-ALL", "RF", "LR-sgd", "LR-scikit", "NN", "SVM"
+)
+model_hex <- c(
+  "#0072B2", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#CC79A7", "#E69F00"
+)
+# Named vector, passed to scale_color_manual(values = colors) by the plot.
+colors <- setNames(model_hex, model_names)
+
+# fpr/tpr scatter, colour = model and shape = descriptor, saved to args[2]. Assigned so Rscript does not auto-print it to the default device; ggsave() gets the plot explicitly.
+roc_plot <- ggplot(data,aes(fpr, tpr, color = model_labels, shape = descriptor_labels)) +
+  geom_point(size = 2.5) +
+  geom_abline() +
+  expand_limits(x=c(0,1),y=c(0,1)) +
+  labs(x = "False positive rate", y = "True positive rate") +
+  scale_color_manual(values = colors) +
+  theme_minimal() +
+  theme(legend.title=element_blank())
+ggsave(args[2], plot = roc_plot)
diff --git a/scripts/roc.rb b/scripts/roc.rb
index 8e2dc51..8c8c93c 100755
--- a/scripts/roc.rb
+++ b/scripts/roc.rb
@@ -2,7 +2,8 @@
 require "yaml"
 
 data = YAML.load(File.read ARGV[0])[:cv]
-puts "tpr,fpr"
+puts "descriptor,model,tpr,fpr" # CSV header row
 data.each do |algo,values|
-  puts [algo.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC"),values[:tpr],values[:fpr]].join(",")
+  desc,model = algo.split("_",2) # split at the first "_" only: descriptor prefix, then the model part
+  puts [desc.upcase,model.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC").sub(/^LR$/,"LR-sgd").sub("LR2","LR-scikit"),values[:tpr],values[:fpr]].join(",") # model name: upcased, "_" -> "-", HIGH-CONFIDENCE -> HC, LR -> LR-sgd, LR2 -> LR-scikit
 end
diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb
deleted file mode 100755
index d0da0af..0000000
--- a/scripts/summary2table.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env ruby
-require 'yaml'
-
-rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"}
-data = YAML.load_file("crossvalidations/summary.yaml")[:cv]
-
-case ARGV[0]
-when "tensorflow"
-  header = ["MP2D-RF","MP2D-LR-sgd","MP2D-LR-scikit","MP2D-NN","MP2D-SVM","CDK-RF","CDK-LR-sgd","CDK-LR-scikit","CDK-NN","CDK-SVM"]
-  desc = ["mp2d","cdk"]
-  algos = ["rf","lr","lr2","nn","svm"]
-  keys = []
-  desc.each do |d| 
-    algos.each do |a|
-      keys << "tensorflow-"+a+"-"+d
-    end
-  end
-when "lazar"
-  header = ["MP2D", "CDK"]
-  mp2dkeys = ["lazar-mp2d-all","lazar-mp2d-high-confidence"]
-  cdkkeys = ["lazar-cdk-all","lazar-cdk-high-confidence"]
-  puts ","+header.join(",")
-  rows.each do |short,long|
-    print long+","
-    print mp2dkeys.collect{|k| data[k][short]}.join("/")+","
-    puts cdkkeys.collect{|k| data[k][short]}.join("/")
-  end
-  exit
-end
-puts ","+header.join(",")
-rows.each do |short,long|
-  print long+","
-  puts keys.collect{|k| data[k][short]}.join(",")
-end
diff --git a/scripts/tsne-cdk-coordinates.R b/scripts/tsne-cdk-coordinates.R
index f9baff9..f9baff9 100644..100755
--- a/scripts/tsne-cdk-coordinates.R
+++ b/scripts/tsne-cdk-coordinates.R
diff --git a/scripts/tsne-classifications.R b/scripts/tsne-classifications.R
new file mode 100755
index 0000000..6f37d11
--- /dev/null
+++ b/scripts/tsne-classifications.R
@@ -0,0 +1,19 @@
+#!/usr/bin/env Rscript
+library(Rtsne)
+library(ggplot2)
+args = commandArgs(trailingOnly=TRUE)
+# Arguments: coordinate CSV (columns x and y are used), label file (first column, no header), output image.
+tsne = read.csv(args[1],header=TRUE)
+tsne[,1] = NULL
+# na.strings = "-" stops read.csv from turning the literal label "NA" into a missing value.
+labels = read.csv(args[2],header=FALSE, na.strings = "-")[,1]
+labels = factor(labels,levels=c("TP","TN","FN","FP","NA","PA-mutagenic","PA-nonmutagenic","PA-NA"))
+tsne_plot = data.frame(x = tsne$x, y = tsne$y)
+blue = "#00BFC4"; red = "#F8766D"; green = "#7CAE00"; grey = "#AAAAAA"
+colors = c("PA-mutagenic" = "red", "PA-nonmutagenic" = "green", "PA-NA" = grey, "TP" = red, "TN" = green, "FP" = "red", "FN" = "green", "NA" = grey)
+shapes = c("PA-mutagenic" = 22, "PA-nonmutagenic" = 22, "PA-NA" = 22,"TP" = 16, "TN" = 16, "FP" = 23, "FN" = 23, "NA" = 16)
+fills = c("PA-mutagenic" = blue, "PA-nonmutagenic" = blue, "PA-NA" = blue, "TP" = red, "TN" = green, "FP" = "green", "FN" = "red", "NA" = grey)
+# Keep the finished plot in a variable: a bare top-level plot expression is auto-printed by Rscript to the default device.
+plot = ggplot(tsne_plot) + geom_point(aes(x=x, y=y, color = labels, shape = labels, fill = labels)) + xlab(element_blank()) + ylab(element_blank()) +
+  theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_shape_manual(values = shapes) + scale_color_manual(values = colors) + scale_fill_manual(values = fills)
+ggsave(args[3], plot = plot)
diff --git a/scripts/tsne-mp2d-coordinates.R b/scripts/tsne-mp2d-coordinates.R
index ef97595..ef97595 100644..100755
--- a/scripts/tsne-mp2d-coordinates.R
+++ b/scripts/tsne-mp2d-coordinates.R