summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2021-02-22 23:26:29 +0100
committer Christoph Helma <helma@in-silico.ch> 2021-02-22 23:26:29 +0100
commit ed83d4c5347ebf43b2de55782b290b66bada4561 (patch)
tree ddf3ee1eb6d4f5d250835345798086b5204a23ee /scripts
parent 3af0c3d5c5b7f7d506a4582bbe3dca7d22bbefcc (diff)
more script consolidations
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/cdk-descriptors.rb 41
-rwxr-xr-x scripts/confusion-matrix.rb 21
-rwxr-xr-x scripts/cv-summary.rb (renamed from scripts/confusion-matrix-summary.rb) 2
-rwxr-xr-x scripts/cv-tensorflow-confusion-matrix.rb 32
-rwxr-xr-x scripts/data.rb 42
-rwxr-xr-x scripts/mp2d-distances.rb 30
-rwxr-xr-x scripts/pa-groups.R 34
-rwxr-xr-x scripts/pa-predictions-latex.rb 74
-rwxr-xr-x scripts/pa-predictions.rb 18
-rwxr-xr-x scripts/pa-summary.rb 2
-rwxr-xr-x scripts/pa-table.rb 70
-rwxr-xr-x scripts/roc.R 43
-rwxr-xr-x scripts/roc.rb 8
-rwxr-xr-x scripts/summary2roc.rb 9
-rwxr-xr-x scripts/summary2table.rb 2
-rwxr-xr-x scripts/tsne-cdk-descriptors.rb 26
-rwxr-xr-x scripts/tsne-cdk.R 21
-rwxr-xr-x scripts/tsne-mp2d-distances.rb 30
-rwxr-xr-x scripts/tsne-mp2d.R 14
-rwxr-xr-x scripts/tsne-mutagenicity.R 12
20 files changed, 274 insertions, 257 deletions
diff --git a/scripts/cdk-descriptors.rb b/scripts/cdk-descriptors.rb
deleted file mode 100755
index bb13f97..0000000
--- a/scripts/cdk-descriptors.rb
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env ruby
-train = File.readlines(ARGV[0])
-pa = File.readlines(ARGV[1])
-train_header = train.shift.chomp.split(",").collect{|i| i.gsub('"','')}
-pa_header = pa.shift.chomp.split(";")
-train_header.shift
-train_header.shift
-pa_header.shift
-
-#train_only = train_header - pa_header
-#pa_only = pa_header - train_header
-#puts train_only.size.to_s+ " training set descriptors missing from PAs:"
-#puts train_only.join(",")
-#puts
-#puts pa_only.size.to_s+ " PA descriptors not in training set:"
-#puts pa_only.join(",")
-#exit
-
-common = train_header & pa_header
-
-puts (["Mutagenicity"]+common).join(",")
-train.each do |line|
- items = line.chomp.split ","
- id = items.shift
- #id = "TRAIN"+id.gsub('"','')
- act = items.shift
- act == '"1"' ? act = "mutagen" : act = "non-mutagen"
- descriptors = {}
- items.each_with_index {|item,i| descriptors[train_header[i]] = item.sub(',','.').to_f }
- puts ([id,act]+common.collect{|h| descriptors[h]}).join(",")
-end
-
-
-pa.each do |line|
- items = line.chomp.split ";"
- id = "PA"+items.shift
- act = "PA"
- descriptors = {}
- items.each_with_index {|item,i| descriptors[pa_header[i]] = item.sub(',','.').to_f }
- puts ([id,act]+common.collect{|h| descriptors[h]}).join(",")
-end
diff --git a/scripts/confusion-matrix.rb b/scripts/confusion-matrix.rb
new file mode 100755
index 0000000..c40ee2f
--- /dev/null
+++ b/scripts/confusion-matrix.rb
@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+require 'csv'
+
+tp = 0
+fp = 0
+tn = 0
+fn = 0
+File.readlines(ARGV[0]).each do |line|
+ pred = line.chomp.split(",").last
+ case pred
+ when "TP"
+ tp+=1
+ when "TN"
+ tn+=1
+ when "FP"
+ fp+=1
+ when "FN"
+ fn+=1
+ end
+end
+puts "#{tp},#{fp}\n#{fn},#{tn}"
diff --git a/scripts/confusion-matrix-summary.rb b/scripts/cv-summary.rb
index 8a32f79..aad7e2a 100755
--- a/scripts/confusion-matrix-summary.rb
+++ b/scripts/cv-summary.rb
@@ -28,7 +28,7 @@ ARGV.each do |f|
:ppv_perc => (100*tp/(tp+fp)).round(0),
:npv_perc => (100*tn/(tn+fn)).round(0),
}
- results[File.basename(f,".csv")] = result
+ results[f.sub("crossvalidations/confusion-matrices/","").sub(".csv","").gsub("/","_").gsub("-","_")] = result
end
results = {:cv => results}
puts results.to_yaml
diff --git a/scripts/cv-tensorflow-confusion-matrix.rb b/scripts/cv-tensorflow-confusion-matrix.rb
deleted file mode 100755
index ae72b8e..0000000
--- a/scripts/cv-tensorflow-confusion-matrix.rb
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-require 'csv'
-
-tp = 0
-fp = 0
-tn = 0
-fn = 0
-
-pred = CSV.read(ARGV[0],headers: true,:col_sep => ",")
-act = CSV.read(File.join("data","training","mutagenicity.csv"),headers: true,:col_sep => ",")
-
-data = {}
-
-pred.each do |row|
- row[1].to_f < 0.5 ? p = 0 : p = 1
- data[row[0]] =[p]
-end
-
-act.each do |row|
- data[row[0]] << row[1].to_i if data[row[0]]
-end
-
-data.each do |smi,a|
-
- tp += 1 if a[0] == 1 and a[1] == 1
- tn += 1 if a[0] == 0 and a[1] == 0
- fp += 1 if a[0] == 0 and a[1] == 1
- fn += 1 if a[0] == 1 and a[1] == 0
-
-end
-
-puts "#{tp},#{fp}\n#{fn},#{tn}"
diff --git a/scripts/data.rb b/scripts/data.rb
index e834677..72e6b28 100755
--- a/scripts/data.rb
+++ b/scripts/data.rb
@@ -10,20 +10,46 @@ data[:cv][:n] = `cut -f1 -d ',' mutagenicity/mutagenicity.csv | wc -l`.chomp.to_
data[:cv][:n_uniq] = `cut -f1 -d ',' mutagenicity/mutagenicity.csv | sort -u | wc -l`.chomp.to_i - 1
data[:cv][:cdk] = {}
-cdk = File.readlines("mutagenicity/cdk/mutagenicity-mod-2.new.csv")
-data[:cv][:cdk][:n_descriptors] = cdk.shift.split(",").size-2
+cdk = File.readlines("mutagenicity/mutagenicity-cdk.csv")
+data[:cv][:cdk][:n_descriptors] = cdk.shift.split(",").size-1
data[:cv][:cdk][:n_compounds] = cdk.size
-
-
-
data[:pa][:groups] = {}
-lines = File.readlines("pyrrolizidine-alkaloids/pa-predictions.csv")
-pa_groups = lines.shift.chomp.split(",")[1..9].collect{|g| g.sub(/[ -]/,"_").to_sym}
+lines = File.readlines("pyrrolizidine-alkaloids/pa-groups.csv")
+pa_groups = lines.shift.chomp.split(",")[1..-1].collect{|g| g.sub(/[ -]/,"_").to_sym}
pa_groups.each {|g| data[:pa][:groups][g] = {}; data[:pa][:groups][g][:n] = 0}
+groups = {}
lines.each do |l|
- l.chomp.split(",")[1..9].each_with_index do |v,i|
+ items = l.chomp.split(",")
+ smi = items[0]
+ items[1..-1].each_with_index do |v,i|
data[:pa][:groups][pa_groups[i]][:n] += v.to_i
+ groups[pa_groups[i]] ||= []
+ groups[pa_groups[i]] << smi if v == "1"
end
end
+lines = File.readlines("pyrrolizidine-alkaloids/pa-predictions.csv")
+algos = lines.shift.chomp.split(",")[1..-1].collect{|g| g.sub(/[ -]/,"_").to_sym}
+lines.each do |l|
+ items = l.chomp.split(",")
+ smi = items[0]
+ items[1..-1].each do |v|
+ groups.each do |group,smiles|
+ data[:pa][:groups][group][:mut] ||= 0
+ data[:pa][:groups][group][:non_mut] ||= 0
+ if smiles.include? smi
+ if v == "1"
+ data[:pa][:groups][group][:mut] += 1
+ elsif v == "0"
+ data[:pa][:groups][group][:non_mut] += 1
+ end
+ end
+ end
+ end
+end
+data[:pa][:groups].each do |g,values|
+ data[:pa][:groups][g][:n_pred] = values[:mut]+values[:non_mut]
+ data[:pa][:groups][g][:mut_perc] = (100*values[:mut]/data[:pa][:groups][g][:n_pred]).round
+ data[:pa][:groups][g][:non_mut_perc] = (100*values[:non_mut]/data[:pa][:groups][g][:n_pred]).round
+end
puts data.to_yaml
diff --git a/scripts/mp2d-distances.rb b/scripts/mp2d-distances.rb
deleted file mode 100755
index 34cc136..0000000
--- a/scripts/mp2d-distances.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../../lazar/lib/similarity.rb"
-
-dependent_variables = File.readlines(File.join("..","lazar","models","mutagenicity-mp2d","dependent-variables")).collect{|l| l.chomp}
-independent_variables = File.readlines(File.join("..","lazar","models","mutagenicity-mp2d","independent-variables")).collect{|l| l.chomp.split ","}
-independent_variables += File.readlines(File.join("..","lazar","predictions","pa-mp2d","independent-variables")).collect{|l| l.chomp.split ","}
-
-dist = []
-independent_variables.each_with_index do |v1,i|
- dist << []
- line = []
- independent_variables.each_with_index do |v2,j|
- if j > i
- d = 1-Similarity.tanimoto([v1,v2])
- dist[i][j] = d
- elsif i == j
- d = 0
- else
- d = dist[j][i]
- end
- line << d
- end
- if dependent_variables[i]
- act="mutagen" if dependent_variables[i] == "1"
- act="non-mutagen" if dependent_variables[i] == "0"
- else
- act="PA"
- end
- puts ([act]+line).join(",")
-end
diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R
index ae4a3c3..3c6ce2c 100755
--- a/scripts/pa-groups.R
+++ b/scripts/pa-groups.R
@@ -1,11 +1,33 @@
#!/usr/bin/env Rscript
library(ggplot2)
-data <- read.csv("pyrrolizidine-alkaloids/pa-predictions.csv")
+args = commandArgs(trailingOnly=TRUE)
+groups = read.csv(args[1],header=T)
+data = read.csv(args[2])
for (i in c(2:10)) {
- name <- names(data)[i]
- cols <- append(c(15,17),c(19:28))
- group <- data[data[i] == 1,cols]
- freq <- 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group))
- plot <- ggplot(data.frame(freq),aes(x=names(data)[cols],y=freq)) + geom_bar(stat="identity") + ylab("% mutagenic") + xlab(element_blank()) + ylim(c(0,100)) + theme(axis.text.x = element_text(angle=90))
+ name = names(groups)[i]
+ cols = c(2:15)
+ group = data[groups[i] == 1,cols]
+ freq = 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group))
+ algos = toupper(names(data)[cols])
+ algos = gsub("HIGH",'HC',algos)
+ algos = gsub(".CONFIDENCE",'',algos)
+ algos = gsub("\\.",'-',algos)
+ algos <- factor(algos,levels=rev(c(
+ "MP2D-LAZAR-ALL",
+ "MP2D-LAZAR-HC",
+ "MP2D-RF",
+ "MP2D-LR",
+ "MP2D-LR2",
+ "MP2D-NN",
+ "MP2D-SVM",
+ "CDK-LAZAR-ALL",
+ "CDK-LAZAR-HC",
+ "CDK-RF",
+ "CDK-LR",
+ "CDK-LR2",
+ "CDK-NN",
+ "CDK-SVM"
+ )))
+ plot = ggplot(data.frame(freq),aes(x=freq,y=algos)) + geom_bar(stat="identity") + xlab("% mutagenic") + ylab(element_blank()) + xlim(c(0,100))# + theme(axis.text.x = element_text(angle=90))
ggsave(paste("figures/",name,".png",sep=""))
}
diff --git a/scripts/pa-predictions-latex.rb b/scripts/pa-predictions-latex.rb
index 1aa6383..d97f3d9 100755
--- a/scripts/pa-predictions-latex.rb
+++ b/scripts/pa-predictions-latex.rb
@@ -1,7 +1,20 @@
#!/usr/bin/env ruby
-#1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\
-puts '
+group_lines = File.readlines(ARGV[0]) # groups
+group_names = []
+group_lines.shift.chomp.split(",")[1..-1].each_with_index do |g,i|
+ group_names << "#{i}: #{g}"
+end
+groups = {}
+group_lines.each do |l|
+ items = l.chomp.split(",")
+ smi = items.shift
+ groups[smi] = items
+end
+pred_lines = File.readlines(ARGV[1]) # predictions
+algo_names = pred_lines.shift.chomp.split(",")[1..-1]
+
+print '
\documentclass[]{scrartcl}
\usepackage{color, colortbl}
\usepackage{longtable}
@@ -14,50 +27,49 @@ puts '
\definecolor{black}{rgb}{0,0,0}
\definecolor{white}{rgb}{1,1,1}
\tiny
-\begin{longtable}{rrrrrrrrrccccccccccccccc}
-\caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; 1: Retronecine, 2: Otonecine, 3: Platynecine, 4: N-oxide, 5: Dehydropyrrolizidine, 6:Tertiary PA, 7: Macrocyclic-diester, 8: Monoester, 9: Diester} \\\\
-\label{tab:pa}
-
-% 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn
-% \multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{4}{c}{Tensorflow}\\\\
-1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Exp. & MP2D & CDK & LR1 & LR2 & NN & RF & SVM & LR1 & LR2 & NN & RF & SVM \\kill % needed as guide for multicolumn
-\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{5}{c}{MP2D} & \multicolumn{5}{c}{CDK}\\\\
-
-1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Exp. & MP2D & CDK & LR1 & LR2 & NN & RF & SVM & LR1 & LR2 & NN & RF & SVM \\\\
-% 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\
-\hline
+\begin{longtable}{rrrrrrrrrcccccccccccccc}
+\caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; '
+print "#{group_names.join(', ')}"
+puts '} \\\\
+\label{tab:pa}'
+header2 = ((1..9).to_a + algo_names.select{|a| !a.match(/high/)}.collect{|a| a.sub(/mp2d|cdk/,'').sub('-','').upcase.sub("LAZAR-ALL","lazar")}).join(" & ")
+print header2
+puts ' \\kill % needed as guide for multicolumn'
+puts '\multicolumn{9}{c}{PA Group} & \multicolumn{6}{c}{MP2D} & \multicolumn{6}{c}{CDK}\\\\'
+puts header2 + '\\\\'
+puts '\hline
\renewcommand{\arraystretch}{0.075}
'
-lines = File.readlines(ARGV[0])
-header = lines.shift.chomp.split(",")
-lines.each do |l|
- row = ""
+pred_lines.each do |l|
+ row = []
values = l.chomp.split(",")
+ smi = values.shift
+ groups[smi].each do |v|
+ v == "1" ? row << '\cellcolor{black}' : row << '\cellcolor{white}'
+ end
values.each_with_index do |v,i|
- if i == 1
- v == "1" ? row += '\cellcolor{black}' : row += '\cellcolor{white}'
- elsif i > 1 and i < 10
- v == "1" ? row += ' & \cellcolor{black}' : row += ' & \cellcolor{white}'
- elsif i == 14 or i == 16 # lazar
+ case algo_names[i]
+ when /lazar-all/
if v == "1"
- values[i+1] == "1" ? row += ' & \cellcolor{red}' : row += ' & \cellcolor{darkred}'
+ values[i+1] == "1" ? row << '\cellcolor{red}' : row << '\cellcolor{darkred}'
elsif v == "0"
- values[i+1] == "0" ? row += ' & \cellcolor{green}' : row += ' & \cellcolor{darkgreen}'
+ values[i+1] == "0" ? row << '\cellcolor{green}' : row << '\cellcolor{darkgreen}'
else
- row += ' & \cellcolor{grey}'
+ row << '\cellcolor{grey}'
end
- elsif i == 13 or i > 17 # measured or tensorflow
+ when /lazar-high-confidence/ # do nothing
+ else
if v == "1"
- row += ' & \cellcolor{red}'
+ row << '\cellcolor{red}'
elsif v == "0"
- row += ' & \cellcolor{green}'
+ row << '\cellcolor{green}'
else
- row += ' & \cellcolor{grey}'
+ row << '\cellcolor{grey}'
end
end
end
- puts row + ' \\\\'
+ puts row.join(" & ") + ' \\\\'
end
puts '
\end{longtable}
diff --git a/scripts/pa-predictions.rb b/scripts/pa-predictions.rb
new file mode 100755
index 0000000..9500c39
--- /dev/null
+++ b/scripts/pa-predictions.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+
+predictions = {}
+algos = []
+
+ARGV.each do |f|
+ name = f.sub("pyrrolizidine-alkaloids/","").sub("/","-").sub(".csv","")
+ algos << name
+ File.readlines(f).each do |l|
+ smi,pred = l.chomp.split(",")
+ predictions[smi] ||= {}
+ predictions[smi][name] = pred
+ end
+end
+puts (["Canonical SMILES"] + algos).join(",")
+predictions.each do |smi,pred|
+ puts ([smi]+algos.collect{|a| pred[a]}).join(",")
+end
diff --git a/scripts/pa-summary.rb b/scripts/pa-summary.rb
index 9fb3d0e..418aa18 100755
--- a/scripts/pa-summary.rb
+++ b/scripts/pa-summary.rb
@@ -7,7 +7,7 @@ summary = {:n => lines.size}
lines.each do |line|
items = line.chomp.split(",")
items.each_with_index do |v,i|
- if header[i].match (/MP2D|CDK/)
+ unless header[i].match (/SMILES/)
key = header[i].gsub("-","_").downcase.to_sym
summary[key] ||= { :n => 0, :mut => 0, :non_mut => 0 }
case v
diff --git a/scripts/pa-table.rb b/scripts/pa-table.rb
deleted file mode 100755
index e2c4983..0000000
--- a/scripts/pa-table.rb
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env ruby
-
-# red groups
-tab = []
-File.read("data/pyrrolizidine-alkaloids/pa-groups.csv").each_line do |l|
- items = l.chomp.split(';')
- if items.first.empty?
- items[0] = "ID"
- else
- id = items.shift
- items.collect!{|i| i == "NA" ? 0 : 1}
- items = [id]+items
- end
- tab << items
-end
-
-tab[0] += ["CID","SMILES","Canonical SMILES","Measured","lazar-all-MP2D","lazar-high-confidence-MP2D","lazar-all-CDK","lazar-high-confidence-CDK"]
-
-i = 0
-File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l|
- if i > 0
- id,cid,name,smi = l.chomp.split(";")
- tab[i] += [cid,'"'+smi+'"']
- end
- i += 1
-end
-
-i = 1
-File.read("pyrrolizidine-alkaloids/mp2d/lazar/predictions").each_line do |l|
- cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",")
- max_sim.to_f < 0.5? hc = "" : hc = mut
- hc = "" if mut.empty?
- tab[i] += ['"'+cansmi+'"',exp,mut,hc]
- i += 1
-end
-
-i=1
-File.read("pyrrolizidine-alkaloids/cdk/lazar/predictions").each_line do |l|
- cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",")
- max_sim.to_f < 0.5? hc = "" : hc = mut
- hc = "" if mut.empty?
- tab[i] += [mut,hc]
- i += 1
-end
-
-Dir["pyrrolizidine-alkaloids/mp2d/tensorflow/*.csv"].each do |r|
- tab[0] << r.sub('pyrrolizidine-alkaloids/mp2d/tensorflow/pred.','').sub(/\..*csv/,'').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")+"-MP2D"
- i = 0
- File.read(r).each_line do |l|
- if i > 0
- id,pred = l.chomp.split(",")
- pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0
- end
- i += 1
- end
-end
-
-Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r|
- tab[0] << r.sub('pyrrolizidine-alkaloids/cdk/tensorflow/pred.','').sub(/\..*csv/,'').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")+"-CDK"
- i = 0
- File.read(r).each_line do |l|
- if i > 0
- id,pred = l.chomp.split(",")
- pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0
- end
- i += 1
- end
-end
-
-puts tab.collect{|r| r.join(",")}.join("\n")
diff --git a/scripts/roc.R b/scripts/roc.R
index 281ab13..32e1674 100755
--- a/scripts/roc.R
+++ b/scripts/roc.R
@@ -1,10 +1,45 @@
#!/usr/bin/env Rscript
library(ggplot2)
data <- read.csv("figures/roc.csv",header=T)
-p <- ggplot(data, aes(x=fpr, y=tpr)) + geom_abline()
-#p <- p + geom_label(label=rownames(data) )
-p <- p + geom_point(aes(color=rownames(data)))
-p <- p + theme(legend.title=element_blank(), legend.position="bottom")
+labels = factor(row.names(data), levels = c("MP2D-LAZAR-HC", "MP2D-LAZAR-ALL", "MP2D-RF", "MP2D-LR", "MP2D-LR2", "MP2D-NN", "MP2D-SVM", "CDK-LAZAR-HC", "CDK-LAZAR-ALL", "CDK-RF", "CDK-LR", "CDK-LR2", "CDK-NN", "CDK-SVM"))
+shapes = c(
+"MP2D-LAZAR-HC" = 16,
+"MP2D-LAZAR-ALL" = 16,
+"MP2D-RF" = 16,
+"MP2D-LR" = 16,
+"MP2D-LR2" = 16,
+"MP2D-NN" = 16,
+"MP2D-SVM" = 16,
+"CDK-LAZAR-HC" = 17,
+"CDK-LAZAR-ALL" = 17,
+"CDK-RF" = 17,
+"CDK-LR" = 17,
+"CDK-LR2" = 17,
+"CDK-NN" = 17,
+"CDK-SVM" = 17)
+
+colors <- c(
+"MP2D-LAZAR-HC" = "#E69F00",
+"MP2D-LAZAR-ALL" = "#56B4E9",
+"MP2D-RF" = "#009E73",
+"MP2D-LR" = "#F0E442",
+"MP2D-LR2" = "#0072B2",
+"MP2D-NN" = "#D55E00",
+"MP2D-SVM" = "#CC79A7",
+"CDK-LAZAR-HC" = "#E69F00",
+"CDK-LAZAR-ALL" = "#56B4E9",
+"CDK-RF" = "#009E73",
+"CDK-LR" = "#F0E442",
+"CDK-LR2" = "#0072B2",
+"CDK-NN" = "#D55E00",
+"CDK-SVM" = "#CC79A7")
+
+p <- ggplot(data)
+p <- p + geom_point(aes(x=fpr, y=tpr, color = labels, shape = labels))
+p <- p + geom_abline()
+p <- p + theme(legend.title=element_blank())
p <- p + expand_limits(x=c(0,1),y=c(0,1))
+p <- p + scale_shape_manual(values = shapes)
+p <- p + scale_color_manual(values = colors)
p <- p + labs(x = "False positive rate", y = "True positive rate")
ggsave("figures/roc.png")
diff --git a/scripts/roc.rb b/scripts/roc.rb
new file mode 100755
index 0000000..8e2dc51
--- /dev/null
+++ b/scripts/roc.rb
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+require "yaml"
+
+data = YAML.load(File.read ARGV[0])[:cv]
+puts "tpr,fpr"
+data.each do |algo,values|
+ puts [algo.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC"),values[:tpr],values[:fpr]].join(",")
+end
diff --git a/scripts/summary2roc.rb b/scripts/summary2roc.rb
deleted file mode 100755
index 258f64d..0000000
--- a/scripts/summary2roc.rb
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env ruby
-require "yaml"
-
-data = YAML.load(File.read ARGV[0])[:cv]
-puts "tpr,fpr"
-data.each do |algo,values|
- algo = algo.sub("tensorflow-","").sub("selected","FS").sub(".v3","").sub("high-confidence","HC").sub("padel","PaDEL").sub("lazar ","lazar-MP2D ").sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("nn","NN").sub("-rf","-RF").sub("-svm","-SVM").sub("cdk","CDK").sub("mp2d","MP2D")
- puts [algo,values[:tpr],values[:fpr]].join(",")
-end
diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb
index 8bc323c..d0da0af 100755
--- a/scripts/summary2table.rb
+++ b/scripts/summary2table.rb
@@ -2,7 +2,7 @@
require 'yaml'
rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"}
-data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv]
+data = YAML.load_file("crossvalidations/summary.yaml")[:cv]
case ARGV[0]
when "tensorflow"
diff --git a/scripts/tsne-cdk-descriptors.rb b/scripts/tsne-cdk-descriptors.rb
new file mode 100755
index 0000000..a994c29
--- /dev/null
+++ b/scripts/tsne-cdk-descriptors.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+train = File.readlines(ARGV[0])
+pa = File.readlines(ARGV[1])
+train_header = train.shift.chomp.split(",").collect{|i| i.gsub('"','')}
+pa_header = pa.shift.chomp.split(",")
+train_header.shift
+pa_header.shift
+
+common = train_header & pa_header
+
+train.each do |line|
+ items = line.chomp.split ","
+ smi = items.shift
+ descriptors = {}
+ items.each_with_index {|item,i| descriptors[train_header[i]] = item.to_f }
+ puts ([smi]+common.collect{|h| descriptors[h]}).join(",")
+end
+
+
+pa.each do |line|
+ items = line.chomp.split ","
+ smi = items.shift
+ descriptors = {}
+ items.each_with_index {|item,i| descriptors[pa_header[i]] = item.to_f }
+ puts ([smi]+common.collect{|h| descriptors[h]}).join(",")
+end
diff --git a/scripts/tsne-cdk.R b/scripts/tsne-cdk.R
index c59d2df..cdf44e5 100755
--- a/scripts/tsne-cdk.R
+++ b/scripts/tsne-cdk.R
@@ -1,14 +1,17 @@
#!/usr/bin/env Rscript
library(Rtsne)
library(ggplot2)
-data <- read.csv("figures/tsne-padel.csv")
-labels <- data$Mutagenicity
-data$Mutagenicity <- NULL
+data <- read.csv("figures/tsne-cdk.csv",header=F)
+#data[,1] <- NULL
+#labels <- data[,2]
+#data[,2] <- NULL
m <- as.matrix(data)
+#class(m) <- "numeric"
+#print(m)
tsne <- Rtsne(m,verbose=T,check_duplicates=F)
-write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"padel-tsne.csv")
-tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2])
-colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00")
-plot <- ggplot(tsne_plot)
-plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors)
-ggsave("figures/tsne-cdk.png")
+write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"figures/tsne-coordinates-cdk.csv")
+#tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2])
+#colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00")
+#plot <- ggplot(tsne_plot)
+#plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors)
+#ggsave("figures/tsne-cdk.png")
diff --git a/scripts/tsne-mp2d-distances.rb b/scripts/tsne-mp2d-distances.rb
new file mode 100755
index 0000000..f0a3afd
--- /dev/null
+++ b/scripts/tsne-mp2d-distances.rb
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+require_relative "../../lazar/lib/similarity.rb"
+
+independent_variables = []
+smiles = []
+ARGV.each do |f|
+ File.readlines(f).each do |l|
+ items = l.chomp.split ","
+ smiles << items.shift
+ independent_variables << items
+ end
+end
+
+dist = []
+independent_variables.each_with_index do |v1,i|
+ dist << []
+ line = []
+ independent_variables.each_with_index do |v2,j|
+ if j > i
+ d = 1-Similarity.tanimoto([v1,v2])
+ dist[i][j] = d
+ elsif i == j
+ d = 0
+ else
+ d = dist[j][i]
+ end
+ line << d
+ end
+ puts ([smiles[i]]+line).join(",")
+end
diff --git a/scripts/tsne-mp2d.R b/scripts/tsne-mp2d.R
deleted file mode 100755
index 3fdab76..0000000
--- a/scripts/tsne-mp2d.R
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env Rscript
-library(Rtsne)
-library(ggplot2)
-data <- read.csv("figures/tsne-mp2d.csv",header=F)
-labels <- data[,1]
-data[,1] <- NULL
-m <- as.matrix(data)
-dist <- as.dist(m)
-tsne <- Rtsne(dist,verbose=T,is_distance=T)
-tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2])
-colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00")
-plot <- ggplot(tsne_plot)
-plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors)
-ggsave("figures/tsne-mp2d.png")
diff --git a/scripts/tsne-mutagenicity.R b/scripts/tsne-mutagenicity.R
new file mode 100755
index 0000000..c8d63d1
--- /dev/null
+++ b/scripts/tsne-mutagenicity.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+library(Rtsne)
+library(ggplot2)
+args = commandArgs(trailingOnly=TRUE)
+tsne = read.csv(args[1],header=T)
+tsne[,1] = NULL
+labels = read.csv(args[2],header=F)[,1]
+tsne_plot = data.frame(x = tsne$x, y = tsne$y)
+colors = c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00")
+plot = ggplot(tsne_plot)
+plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors)
+ggsave(args[3])