diff options
author | Christoph Helma <helma@in-silico.ch> | 2021-02-22 23:26:29 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2021-02-22 23:26:29 +0100 |
commit | ed83d4c5347ebf43b2de55782b290b66bada4561 (patch) | |
tree | ddf3ee1eb6d4f5d250835345798086b5204a23ee /scripts | |
parent | 3af0c3d5c5b7f7d506a4582bbe3dca7d22bbefcc (diff) |
more script consolidations
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/cdk-descriptors.rb | 41 | ||||
-rwxr-xr-x | scripts/confusion-matrix.rb | 21 | ||||
-rwxr-xr-x | scripts/cv-summary.rb (renamed from scripts/confusion-matrix-summary.rb) | 2 | ||||
-rwxr-xr-x | scripts/cv-tensorflow-confusion-matrix.rb | 32 | ||||
-rwxr-xr-x | scripts/data.rb | 42 | ||||
-rwxr-xr-x | scripts/mp2d-distances.rb | 30 | ||||
-rwxr-xr-x | scripts/pa-groups.R | 34 | ||||
-rwxr-xr-x | scripts/pa-predictions-latex.rb | 74 | ||||
-rwxr-xr-x | scripts/pa-predictions.rb | 18 | ||||
-rwxr-xr-x | scripts/pa-summary.rb | 2 | ||||
-rwxr-xr-x | scripts/pa-table.rb | 70 | ||||
-rwxr-xr-x | scripts/roc.R | 43 | ||||
-rwxr-xr-x | scripts/roc.rb | 8 | ||||
-rwxr-xr-x | scripts/summary2roc.rb | 9 | ||||
-rwxr-xr-x | scripts/summary2table.rb | 2 | ||||
-rwxr-xr-x | scripts/tsne-cdk-descriptors.rb | 26 | ||||
-rwxr-xr-x | scripts/tsne-cdk.R | 21 | ||||
-rwxr-xr-x | scripts/tsne-mp2d-distances.rb | 30 | ||||
-rwxr-xr-x | scripts/tsne-mp2d.R | 14 | ||||
-rwxr-xr-x | scripts/tsne-mutagenicity.R | 12 |
20 files changed, 274 insertions, 257 deletions
diff --git a/scripts/cdk-descriptors.rb b/scripts/cdk-descriptors.rb deleted file mode 100755 index bb13f97..0000000 --- a/scripts/cdk-descriptors.rb +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env ruby -train = File.readlines(ARGV[0]) -pa = File.readlines(ARGV[1]) -train_header = train.shift.chomp.split(",").collect{|i| i.gsub('"','')} -pa_header = pa.shift.chomp.split(";") -train_header.shift -train_header.shift -pa_header.shift - -#train_only = train_header - pa_header -#pa_only = pa_header - train_header -#puts train_only.size.to_s+ " training set descriptors missing from PAs:" -#puts train_only.join(",") -#puts -#puts pa_only.size.to_s+ " PA descriptors not in training set:" -#puts pa_only.join(",") -#exit - -common = train_header & pa_header - -puts (["Mutagenicity"]+common).join(",") -train.each do |line| - items = line.chomp.split "," - id = items.shift - #id = "TRAIN"+id.gsub('"','') - act = items.shift - act == '"1"' ? act = "mutagen" : act = "non-mutagen" - descriptors = {} - items.each_with_index {|item,i| descriptors[train_header[i]] = item.sub(',','.').to_f } - puts ([id,act]+common.collect{|h| descriptors[h]}).join(",") -end - - -pa.each do |line| - items = line.chomp.split ";" - id = "PA"+items.shift - act = "PA" - descriptors = {} - items.each_with_index {|item,i| descriptors[pa_header[i]] = item.sub(',','.').to_f } - puts ([id,act]+common.collect{|h| descriptors[h]}).join(",") -end diff --git a/scripts/confusion-matrix.rb b/scripts/confusion-matrix.rb new file mode 100755 index 0000000..c40ee2f --- /dev/null +++ b/scripts/confusion-matrix.rb @@ -0,0 +1,21 @@ +#!/usr/bin/env ruby +require 'csv' + +tp = 0 +fp = 0 +tn = 0 +fn = 0 +File.readlines(ARGV[0]).each do |line| + pred = line.chomp.split(",").last + case pred + when "TP" + tp+=1 + when "TN" + tn+=1 + when "FP" + fp+=1 + when "FN" + fn+=1 + end +end +puts "#{tp},#{fp}\n#{fn},#{tn}" diff --git a/scripts/confusion-matrix-summary.rb b/scripts/cv-summary.rb index 8a32f79..aad7e2a 100755 --- a/scripts/confusion-matrix-summary.rb +++ b/scripts/cv-summary.rb @@ -28,7 +28,7 @@ ARGV.each do |f| :ppv_perc => (100*tp/(tp+fp)).round(0), :npv_perc => (100*tn/(tn+fn)).round(0), } - results[File.basename(f,".csv")] = result + results[f.sub("crossvalidations/confusion-matrices/","").sub(".csv","").gsub("/","_").gsub("-","_")] = result end results = {:cv => results} puts results.to_yaml diff --git a/scripts/cv-tensorflow-confusion-matrix.rb b/scripts/cv-tensorflow-confusion-matrix.rb deleted file mode 100755 index ae72b8e..0000000 --- a/scripts/cv-tensorflow-confusion-matrix.rb +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env ruby -require 'csv' - -tp = 0 -fp = 0 -tn = 0 -fn = 0 - -pred = CSV.read(ARGV[0],headers: true,:col_sep => ",") -act = CSV.read(File.join("data","training","mutagenicity.csv"),headers: true,:col_sep => ",") - -data = {} - -pred.each do |row| - row[1].to_f < 0.5 ? p = 0 : p = 1 - data[row[0]] =[p] -end - -act.each do |row| - data[row[0]] << row[1].to_i if data[row[0]] -end - -data.each do |smi,a| - - tp += 1 if a[0] == 1 and a[1] == 1 - tn += 1 if a[0] == 0 and a[1] == 0 - fp += 1 if a[0] == 0 and a[1] == 1 - fn += 1 if a[0] == 1 and a[1] == 0 - -end - -puts "#{tp},#{fp}\n#{fn},#{tn}" diff --git a/scripts/data.rb b/scripts/data.rb index e834677..72e6b28 100755 --- a/scripts/data.rb +++ b/scripts/data.rb @@ -10,20 +10,46 @@ data[:cv][:n] = `cut -f1 -d ',' mutagenicity/mutagenicity.csv | wc -l`.chomp.to_ data[:cv][:n_uniq] = `cut -f1 -d ',' mutagenicity/mutagenicity.csv | sort -u | wc -l`.chomp.to_i - 1 data[:cv][:cdk] = {} -cdk = File.readlines("mutagenicity/cdk/mutagenicity-mod-2.new.csv") -data[:cv][:cdk][:n_descriptors] = cdk.shift.split(",").size-2 +cdk = File.readlines("mutagenicity/mutagenicity-cdk.csv") +data[:cv][:cdk][:n_descriptors] = cdk.shift.split(",").size-1 data[:cv][:cdk][:n_compounds] = cdk.size - - - data[:pa][:groups] = {} -lines = File.readlines("pyrrolizidine-alkaloids/pa-predictions.csv") -pa_groups = lines.shift.chomp.split(",")[1..9].collect{|g| g.sub(/[ -]/,"_").to_sym} +lines = File.readlines("pyrrolizidine-alkaloids/pa-groups.csv") +pa_groups = lines.shift.chomp.split(",")[1..-1].collect{|g| g.sub(/[ -]/,"_").to_sym} pa_groups.each {|g| data[:pa][:groups][g] = {}; data[:pa][:groups][g][:n] = 0} +groups = {} lines.each do |l| - l.chomp.split(",")[1..9].each_with_index do |v,i| + items = l.chomp.split(",") + smi = items[0] + items[1..-1].each_with_index do |v,i| data[:pa][:groups][pa_groups[i]][:n] += v.to_i + groups[pa_groups[i]] ||= [] + groups[pa_groups[i]] << smi if v == "1" end end +lines = File.readlines("pyrrolizidine-alkaloids/pa-predictions.csv") +algos = lines.shift.chomp.split(",")[1..-1].collect{|g| g.sub(/[ -]/,"_").to_sym} +lines.each do |l| + items = l.chomp.split(",") + smi = items[0] + items[1..-1].each do |v| + groups.each do |group,smiles| + data[:pa][:groups][group][:mut] ||= 0 + data[:pa][:groups][group][:non_mut] ||= 0 + if smiles.include? smi + if v == "1" + data[:pa][:groups][group][:mut] += 1 + elsif v == "0" + data[:pa][:groups][group][:non_mut] += 1 + end + end + end + end +end +data[:pa][:groups].each do |g,values| + data[:pa][:groups][g][:n_pred] = values[:mut]+values[:non_mut] + data[:pa][:groups][g][:mut_perc] = (100*values[:mut]/data[:pa][:groups][g][:n_pred]).round + data[:pa][:groups][g][:non_mut_perc] = (100*values[:non_mut]/data[:pa][:groups][g][:n_pred]).round +end puts data.to_yaml diff --git a/scripts/mp2d-distances.rb b/scripts/mp2d-distances.rb deleted file mode 100755 index 34cc136..0000000 --- a/scripts/mp2d-distances.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../../lazar/lib/similarity.rb" - -dependent_variables = File.readlines(File.join("..","lazar","models","mutagenicity-mp2d","dependent-variables")).collect{|l| l.chomp} -independent_variables = File.readlines(File.join("..","lazar","models","mutagenicity-mp2d","independent-variables")).collect{|l| l.chomp.split ","} -independent_variables += File.readlines(File.join("..","lazar","predictions","pa-mp2d","independent-variables")).collect{|l| l.chomp.split ","} - -dist = [] -independent_variables.each_with_index do |v1,i| - dist << [] - line = [] - independent_variables.each_with_index do |v2,j| - if j > i - d = 1-Similarity.tanimoto([v1,v2]) - dist[i][j] = d - elsif i == j - d = 0 - else - d = dist[j][i] - end - line << d - end - if dependent_variables[i] - act="mutagen" if dependent_variables[i] == "1" - act="non-mutagen" if dependent_variables[i] == "0" - else - act="PA" - end - puts ([act]+line).join(",") -end diff --git a/scripts/pa-groups.R b/scripts/pa-groups.R index ae4a3c3..3c6ce2c 100755 --- a/scripts/pa-groups.R +++ b/scripts/pa-groups.R @@ -1,11 +1,33 @@ #!/usr/bin/env Rscript library(ggplot2) -data <- read.csv("pyrrolizidine-alkaloids/pa-predictions.csv") +args = commandArgs(trailingOnly=TRUE) +groups = read.csv(args[1],header=T) +data = read.csv(args[2]) for (i in c(2:10)) { - name <- names(data)[i] - cols <- append(c(15,17),c(19:28)) - group <- data[data[i] == 1,cols] - freq <- 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group)) - plot <- ggplot(data.frame(freq),aes(x=names(data)[cols],y=freq)) + geom_bar(stat="identity") + ylab("% mutagenic") + xlab(element_blank()) + ylim(c(0,100)) + theme(axis.text.x = element_text(angle=90)) + name = names(groups)[i] + cols = c(2:15) + group = data[groups[i] == 1,cols] + freq = 100*colSums(group,na.rm=TRUE)/colSums(!is.na(group)) + algos = toupper(names(data)[cols]) + algos = gsub("HIGH",'HC',algos) + algos = gsub(".CONFIDENCE",'',algos) + algos = gsub("\\.",'-',algos) + algos <- factor(algos,levels=rev(c( + "MP2D-LAZAR-ALL", + "MP2D-LAZAR-HC", + "MP2D-RF", + "MP2D-LR", + "MP2D-LR2", + "MP2D-NN", + "MP2D-SVM", + "CDK-LAZAR-ALL", + "CDK-LAZAR-HC", + "CDK-RF", + "CDK-LR", + "CDK-LR2", + "CDK-NN", + "CDK-SVM" + ))) + plot = ggplot(data.frame(freq),aes(x=freq,y=algos)) + geom_bar(stat="identity") + xlab("% mutagenic") + ylab(element_blank()) + xlim(c(0,100))# + theme(axis.text.x = element_text(angle=90)) ggsave(paste("figures/",name,".png",sep="")) } diff --git a/scripts/pa-predictions-latex.rb b/scripts/pa-predictions-latex.rb index 1aa6383..d97f3d9 100755 --- a/scripts/pa-predictions-latex.rb +++ b/scripts/pa-predictions-latex.rb @@ -1,7 +1,20 @@ #!/usr/bin/env ruby -#1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\ -puts ' +group_lines = File.readlines(ARGV[0]) # groups +group_names = [] +group_lines.shift.chomp.split(",")[1..-1].each_with_index do |g,i| + group_names << "#{i}: #{g}" +end +groups = {} +group_lines.each do |l| + items = l.chomp.split(",") + smi = items.shift + groups[smi] = items +end +pred_lines = File.readlines(ARGV[1]) # predictions +algo_names = pred_lines.shift.chomp.split(",")[1..-1] + +print ' \documentclass[]{scrartcl} \usepackage{color, colortbl} \usepackage{longtable} @@ -14,50 +27,49 @@ puts ' \definecolor{black}{rgb}{0,0,0} \definecolor{white}{rgb}{1,1,1} \tiny -\begin{longtable}{rrrrrrrrrccccccccccccccc} -\caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; 1: Retronecine, 2: Otonecine, 3: Platynecine, 4: N-oxide, 5: Dehydropyrrolizidine, 6:Tertiary PA, 7: Macrocyclic-diester, 8: Monoester, 9: Diester} \\\\ -\label{tab:pa} - -% 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & DL & SVM & LR-sgd & LR-scikit & NN & RF \\kill % needed as guide for multicolumn -% \multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{4}{c}{Tensorflow}\\\\ -1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Exp. & MP2D & CDK & LR1 & LR2 & NN & RF & SVM & LR1 & LR2 & NN & RF & SVM \\kill % needed as guide for multicolumn -\multicolumn{9}{c}{PA Group} & & \multicolumn{2}{c}{lazar} & \multicolumn{5}{c}{MP2D} & \multicolumn{5}{c}{CDK}\\\\ - -1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Exp. & MP2D & CDK & LR1 & LR2 & NN & RF & SVM & LR1 & LR2 & NN & RF & SVM \\\\ -% 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Measured & MP2D & CDK & LR-sgd & LR-scikit & NN & RF \\\\ -\hline +\begin{longtable}{rrrrrrrrrcccccccccccccc} +\caption{Summary of pyrrolizidine alkaloid predictions: red: mutagen, green: non-mutagen, grey: no prediction, dark red/green: low confidence; ' +print "#{group_names.join(', ')}" +puts '} \\\\ +\label{tab:pa}' +header2 = ((1..9).to_a + algo_names.select{|a| !a.match(/high/)}.collect{|a| a.sub(/mp2d|cdk/,'').sub('-','').upcase.sub("LAZAR-ALL","lazar")}).join(" & ") +print header2 +puts ' \\kill % needed as guide for multicolumn' +puts '\multicolumn{9}{c}{PA Group} & \multicolumn{6}{c}{MP2D} & \multicolumn{6}{c}{CDK}\\\\' +puts header2 + '\\\\' +puts '\hline \renewcommand{\arraystretch}{0.075} ' -lines = File.readlines(ARGV[0]) -header = lines.shift.chomp.split(",") -lines.each do |l| - row = "" +pred_lines.each do |l| + row = [] values = l.chomp.split(",") + smi = values.shift + groups[smi].each do |v| + v == "1" ? row << '\cellcolor{black}' : row << '\cellcolor{white}' + end values.each_with_index do |v,i| - if i == 1 - v == "1" ? row += '\cellcolor{black}' : row += '\cellcolor{white}' - elsif i > 1 and i < 10 - v == "1" ? row += ' & \cellcolor{black}' : row += ' & \cellcolor{white}' - elsif i == 14 or i == 16 # lazar + case algo_names[i] + when /lazar-all/ if v == "1" - values[i+1] == "1" ? row += ' & \cellcolor{red}' : row += ' & \cellcolor{darkred}' + values[i+1] == "1" ? row << '\cellcolor{red}' : row << '\cellcolor{darkred}' elsif v == "0" - values[i+1] == "0" ? row += ' & \cellcolor{green}' : row += ' & \cellcolor{darkgreen}' + values[i+1] == "0" ? row << '\cellcolor{green}' : row << '\cellcolor{darkgreen}' else - row += ' & \cellcolor{grey}' + row << '\cellcolor{grey}' end - elsif i == 13 or i > 17 # measured or tensorflow + when /lazar-high-confidence/ # do nothing + else if v == "1" - row += ' & \cellcolor{red}' + row << '\cellcolor{red}' elsif v == "0" - row += ' & \cellcolor{green}' + row << '\cellcolor{green}' else - row += ' & \cellcolor{grey}' + row << '\cellcolor{grey}' end end end - puts row + ' \\\\' + puts row.join(" & ") + ' \\\\' end puts ' \end{longtable} diff --git a/scripts/pa-predictions.rb b/scripts/pa-predictions.rb new file mode 100755 index 0000000..9500c39 --- /dev/null +++ b/scripts/pa-predictions.rb @@ -0,0 +1,18 @@ +#!/usr/bin/env ruby + +predictions = {} +algos = [] + +ARGV.each do |f| + name = f.sub("pyrrolizidine-alkaloids/","").sub("/","-").sub(".csv","") + algos << name + File.readlines(f).each do |l| + smi,pred = l.chomp.split(",") + predictions[smi] ||= {} + predictions[smi][name] = pred + end +end +puts (["Canonical SMILES"] + algos).join(",") +predictions.each do |smi,pred| + puts ([smi]+algos.collect{|a| pred[a]}).join(",") +end diff --git a/scripts/pa-summary.rb b/scripts/pa-summary.rb index 9fb3d0e..418aa18 100755 --- a/scripts/pa-summary.rb +++ b/scripts/pa-summary.rb @@ -7,7 +7,7 @@ summary = {:n => lines.size} lines.each do |line| items = line.chomp.split(",") items.each_with_index do |v,i| - if header[i].match (/MP2D|CDK/) + unless header[i].match (/SMILES/) key = header[i].gsub("-","_").downcase.to_sym summary[key] ||= { :n => 0, :mut => 0, :non_mut => 0 } case v diff --git a/scripts/pa-table.rb b/scripts/pa-table.rb deleted file mode 100755 index e2c4983..0000000 --- a/scripts/pa-table.rb +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env ruby - -# red groups -tab = [] -File.read("data/pyrrolizidine-alkaloids/pa-groups.csv").each_line do |l| - items = l.chomp.split(';') - if items.first.empty? - items[0] = "ID" - else - id = items.shift - items.collect!{|i| i == "NA" ? 0 : 1} - items = [id]+items - end - tab << items -end - -tab[0] += ["CID","SMILES","Canonical SMILES","Measured","lazar-all-MP2D","lazar-high-confidence-MP2D","lazar-all-CDK","lazar-high-confidence-CDK"] - -i = 0 -File.read("pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv").each_line do |l| - if i > 0 - id,cid,name,smi = l.chomp.split(";") - tab[i] += [cid,'"'+smi+'"'] - end - i += 1 -end - -i = 1 -File.read("pyrrolizidine-alkaloids/mp2d/lazar/predictions").each_line do |l| - cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",") - max_sim.to_f < 0.5? hc = "" : hc = mut - hc = "" if mut.empty? - tab[i] += ['"'+cansmi+'"',exp,mut,hc] - i += 1 -end - -i=1 -File.read("pyrrolizidine-alkaloids/cdk/lazar/predictions").each_line do |l| - cansmi,exp,mut,p0,p1,max_sim,nn = l.chomp.split(",") - max_sim.to_f < 0.5? hc = "" : hc = mut - hc = "" if mut.empty? - tab[i] += [mut,hc] - i += 1 -end - -Dir["pyrrolizidine-alkaloids/mp2d/tensorflow/*.csv"].each do |r| - tab[0] << r.sub('pyrrolizidine-alkaloids/mp2d/tensorflow/pred.','').sub(/\..*csv/,'').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")+"-MP2D" - i = 0 - File.read(r).each_line do |l| - if i > 0 - id,pred = l.chomp.split(",") - pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0 - end - i += 1 - end -end - -Dir["pyrrolizidine-alkaloids/cdk/tensorflow/*.csv"].each do |r| - tab[0] << r.sub('pyrrolizidine-alkaloids/cdk/tensorflow/pred.','').sub(/\..*csv/,'').sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("rf","RF").sub("nn","NN").sub("svm","SVM")+"-CDK" - i = 0 - File.read(r).each_line do |l| - if i > 0 - id,pred = l.chomp.split(",") - pred.to_f > 0.5 ? tab[i] << 1 : tab[i] << 0 - end - i += 1 - end -end - -puts tab.collect{|r| r.join(",")}.join("\n") diff --git a/scripts/roc.R b/scripts/roc.R index 281ab13..32e1674 100755 --- a/scripts/roc.R +++ b/scripts/roc.R @@ -1,10 +1,45 @@ #!/usr/bin/env Rscript library(ggplot2) data <- read.csv("figures/roc.csv",header=T) -p <- ggplot(data, aes(x=fpr, y=tpr)) + geom_abline() -#p <- p + geom_label(label=rownames(data) ) -p <- p + geom_point(aes(color=rownames(data))) -p <- p + theme(legend.title=element_blank(), legend.position="bottom") +labels = factor(row.names(data), levels = c("MP2D-LAZAR-HC", "MP2D-LAZAR-ALL", "MP2D-RF", "MP2D-LR", "MP2D-LR2", "MP2D-NN", "MP2D-SVM", "CDK-LAZAR-HC", "CDK-LAZAR-ALL", "CDK-RF", "CDK-LR", "CDK-LR2", "CDK-NN", "CDK-SVM")) +shapes = c( +"MP2D-LAZAR-HC" = 16, +"MP2D-LAZAR-ALL" = 16, +"MP2D-RF" = 16, +"MP2D-LR" = 16, +"MP2D-LR2" = 16, +"MP2D-NN" = 16, +"MP2D-SVM" = 16, +"CDK-LAZAR-HC" = 17, +"CDK-LAZAR-ALL" = 17, +"CDK-RF" = 17, +"CDK-LR" = 17, +"CDK-LR2" = 17, +"CDK-NN" = 17, +"CDK-SVM" = 17) + +colors <- c( +"MP2D-LAZAR-HC" = "#E69F00", +"MP2D-LAZAR-ALL" = "#56B4E9", +"MP2D-RF" = "#009E73", +"MP2D-LR" = "#F0E442", +"MP2D-LR2" = "#0072B2", +"MP2D-NN" = "#D55E00", +"MP2D-SVM" = "#CC79A7", +"CDK-LAZAR-HC" = "#E69F00", +"CDK-LAZAR-ALL" = "#56B4E9", +"CDK-RF" = "#009E73", +"CDK-LR" = "#F0E442", +"CDK-LR2" = "#0072B2", +"CDK-NN" = "#D55E00", +"CDK-SVM" = "#CC79A7") + +p <- ggplot(data) +p <- p + geom_point(aes(x=fpr, y=tpr, color = labels, shape = labels)) +p <- p + geom_abline() +p <- p + theme(legend.title=element_blank()) p <- p + expand_limits(x=c(0,1),y=c(0,1)) +p <- p + scale_shape_manual(values = shapes) +p <- p + scale_color_manual(values = colors) p <- p + labs(x = "False positive rate", y = "True positive rate") ggsave("figures/roc.png") diff --git a/scripts/roc.rb b/scripts/roc.rb new file mode 100755 index 0000000..8e2dc51 --- /dev/null +++ b/scripts/roc.rb @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby +require "yaml" + +data = YAML.load(File.read ARGV[0])[:cv] +puts "tpr,fpr" +data.each do |algo,values| + puts [algo.upcase.gsub('_','-').sub("HIGH-CONFIDENCE","HC"),values[:tpr],values[:fpr]].join(",") +end diff --git a/scripts/summary2roc.rb b/scripts/summary2roc.rb deleted file mode 100755 index 258f64d..0000000 --- a/scripts/summary2roc.rb +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env ruby -require "yaml" - -data = YAML.load(File.read ARGV[0])[:cv] -puts "tpr,fpr" -data.each do |algo,values| - algo = algo.sub("tensorflow-","").sub("selected","FS").sub(".v3","").sub("high-confidence","HC").sub("padel","PaDEL").sub("lazar ","lazar-MP2D ").sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("nn","NN").sub("-rf","-RF").sub("-svm","-SVM").sub("cdk","CDK").sub("mp2d","MP2D") - puts [algo,values[:tpr],values[:fpr]].join(",") -end diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb index 8bc323c..d0da0af 100755 --- a/scripts/summary2table.rb +++ b/scripts/summary2table.rb @@ -2,7 +2,7 @@ require 'yaml' rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"} -data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv] +data = YAML.load_file("crossvalidations/summary.yaml")[:cv] case ARGV[0] when "tensorflow" diff --git a/scripts/tsne-cdk-descriptors.rb b/scripts/tsne-cdk-descriptors.rb new file mode 100755 index 0000000..a994c29 --- /dev/null +++ b/scripts/tsne-cdk-descriptors.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby +train = File.readlines(ARGV[0]) +pa = File.readlines(ARGV[1]) +train_header = train.shift.chomp.split(",").collect{|i| i.gsub('"','')} +pa_header = pa.shift.chomp.split(",") +train_header.shift +pa_header.shift + +common = train_header & pa_header + +train.each do |line| + items = line.chomp.split "," + smi = items.shift + descriptors = {} + items.each_with_index {|item,i| descriptors[train_header[i]] = item.to_f } + puts ([smi]+common.collect{|h| descriptors[h]}).join(",") +end + + +pa.each do |line| + items = line.chomp.split "," + smi = items.shift + descriptors = {} + items.each_with_index {|item,i| descriptors[pa_header[i]] = item.to_f } + puts ([smi]+common.collect{|h| descriptors[h]}).join(",") +end diff --git a/scripts/tsne-cdk.R b/scripts/tsne-cdk.R index c59d2df..cdf44e5 100755 --- a/scripts/tsne-cdk.R +++ b/scripts/tsne-cdk.R @@ -1,14 +1,17 @@ #!/usr/bin/env Rscript library(Rtsne) library(ggplot2) -data <- read.csv("figures/tsne-padel.csv") -labels <- data$Mutagenicity -data$Mutagenicity <- NULL +data <- read.csv("figures/tsne-cdk.csv",header=F) +#data[,1] <- NULL +#labels <- data[,2] +#data[,2] <- NULL m <- as.matrix(data) +#class(m) <- "numeric" +#print(m) tsne <- Rtsne(m,verbose=T,check_duplicates=F) -write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"padel-tsne.csv") -tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2]) -colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00") -plot <- ggplot(tsne_plot) -plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors) -ggsave("figures/tsne-cdk.png") +write.csv(data.frame(x = tsne$Y[,1], y = tsne$Y[,2]),"figures/tsne-coordinates-cdk.csv") +#tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2]) +#colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00") +#plot <- ggplot(tsne_plot) +#plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors) +#ggsave("figures/tsne-cdk.png") diff --git a/scripts/tsne-mp2d-distances.rb b/scripts/tsne-mp2d-distances.rb new file mode 100755 index 0000000..f0a3afd --- /dev/null +++ b/scripts/tsne-mp2d-distances.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby +require_relative "../../lazar/lib/similarity.rb" + +independent_variables = [] +smiles = [] +ARGV.each do |f| + File.readlines(f).each do |l| + items = l.chomp.split "," + smiles << items.shift + independent_variables << items + end +end + +dist = [] +independent_variables.each_with_index do |v1,i| + dist << [] + line = [] + independent_variables.each_with_index do |v2,j| + if j > i + d = 1-Similarity.tanimoto([v1,v2]) + dist[i][j] = d + elsif i == j + d = 0 + else + d = dist[j][i] + end + line << d + end + puts ([smiles[i]]+line).join(",") +end diff --git a/scripts/tsne-mp2d.R b/scripts/tsne-mp2d.R deleted file mode 100755 index 3fdab76..0000000 --- a/scripts/tsne-mp2d.R +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env Rscript -library(Rtsne) -library(ggplot2) -data <- read.csv("figures/tsne-mp2d.csv",header=F) -labels <- data[,1] -data[,1] <- NULL -m <- as.matrix(data) -dist <- as.dist(m) -tsne <- Rtsne(dist,verbose=T,is_distance=T) -tsne_plot <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2]) -colors <- c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00") -plot <- ggplot(tsne_plot) -plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors) -ggsave("figures/tsne-mp2d.png") diff --git a/scripts/tsne-mutagenicity.R b/scripts/tsne-mutagenicity.R new file mode 100755 index 0000000..c8d63d1 --- /dev/null +++ b/scripts/tsne-mutagenicity.R @@ -0,0 +1,12 @@ +#!/usr/bin/env Rscript +library(Rtsne) +library(ggplot2) +args = commandArgs(trailingOnly=TRUE) +tsne = read.csv(args[1],header=T) +tsne[,1] = NULL +labels = read.csv(args[2],header=F)[,1] +tsne_plot = data.frame(x = tsne$x, y = tsne$y) +colors = c("PA" = "#00BFC4", "mutagen" = "#F8766D", "non-mutagen" = "#7CAE00") +plot = ggplot(tsne_plot) +plot + geom_point(aes(x=x, y=y, color = labels)) + xlab(element_blank()) + ylab(element_blank()) + theme(axis.ticks = element_blank(), axis.text = element_blank(), legend.title=element_blank(), legend.position="bottom") + scale_color_manual(values = colors) +ggsave(args[3]) |