From 9d7b4aaff715e731ba81bf131dfaa9de5a9d0fdd Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Wed, 17 Feb 2021 23:11:49 +0100
Subject: cleanup, scripts adjusted, improved figures

---
 Makefile | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'Makefile')

diff --git a/Makefile b/Makefile
index 41128b3..5b97564 100644
--- a/Makefile
+++ b/Makefile
@@ -15,10 +15,12 @@ TEMPLATE_FILE_LATEX = pandoc-scholar.latex
 
 # crossvalidations
 
-CV_DIR = 10-fold-crossvalidations/
+CV_DIR = crossvalidations/
+LAZAR_MODELS_DIR = ../lazar/models/
+LAZAR_PREDICTIONS_DIR = ../lazar/predictions/
 
-LAZAR_MP2D_CONFUSION_MATRIX_DIR = $(CV_DIR)mp2d/lazar/crossvalidation/confusion_matrices/
-LAZAR_CDK_CONFUSION_MATRIX_DIR = $(CV_DIR)cdk/lazar/crossvalidation/confusion_matrices/
+LAZAR_MP2D_CONFUSION_MATRIX_DIR = $(LAZAR_MODELS_DIR)mutagenicity-mp2d/crossvalidation/confusion-matrices/
+LAZAR_CDK_CONFUSION_MATRIX_DIR = $(LAZAR_MODELS_DIR)mutagenicity-cdk/crossvalidation/confusion-matrices/
 
 TENSORFLOW_MP2D_CV_DIR = $(CV_DIR)mp2d/tensorflow/
 TENSORFLOW_CDK_CV_DIR = $(CV_DIR)cdk/tensorflow/
@@ -54,7 +56,6 @@ CV_SUMMARY = $(CV_DIR)summary.yaml
 # PA predictions
 
 PA_DIR = pyrrolizidine-alkaloids/
-PA_DATA_DIR = data/pyrrolizidine-alkaloids/
 
 PA_MP2D_DIR = $(PA_DIR)mp2d/
 PA_CDK_DIR = $(PA_DIR)cdk/
@@ -82,8 +83,8 @@ PA_CDK_TENSORFLOW_PREDICTIONS = $(addprefix $(PA_CDK_TENSORFLOW_DIR), \
 )
 
 PA_PREDICTIONS = \
-  $(PA_MP2D_LAZAR_DIR)pa-mp2d-predictions.csv \
-  $(PA_CDK_LAZAR_DIR)pa-cdk-predictions.csv \
+  $(PA_MP2D_LAZAR_DIR)predictions\
+  $(PA_CDK_LAZAR_DIR)predictions \
   $(PA_MP2D_TENSORFLOW_PREDICTIONS) \
   $(PA_CDK_TENSORFLOW_PREDICTIONS)
 
@@ -95,13 +96,9 @@ TABLES = $(addprefix tables/, \
   lazar-summary.csv \
   tensorflow-summary.csv \
   pa-tab.tex \
-  pa-summary.csv \
 )
 
-FIGURES = $(addprefix figures/, \
-  roc.png \
-  tsne-mp2d.png \
-  tsne-cdk.png \
+PA_FIGURES = $(addprefix figures/, \
   Dehydropyrrolizidine.png \
   Diester.png \
   Macrocyclic.diester.png \
@@ -113,6 +110,8 @@ FIGURES = $(addprefix figures/, \
   Tertiary.PA.png \
 )
 
+FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-cdk.png $(PA_FIGURES)
+
 SUMMARY = summary.yaml
 
 all: mutagenicity.pdf
@@ -126,15 +125,13 @@ mutagenicity.mustache.md: $(SUMMARY) mutagenicity.md $(TABLES) $(FIGURES)
 figures/tsne-cdk.png: figures/tsne-cdk.csv
 	Rscript scripts/tsne-cdk.R
 
-# TODO: filtered CDK descriptors
-figures/tsne-cdk.csv: data/training/GenoTox-database.csv data/pyrrolizidine-alkaloids/PA-Padel-2D_m2.csv
+figures/tsne-cdk.csv: mutagenicity/cdk/mutagenicity-mod-2.new.csv pyrrolizidine-alkaloids/PA-Padel-2D_m2.csv
 	scripts/cdk-descriptors.rb $^ > $@
 
 figures/tsne-mp2d.png: figures/tsne-mp2d.csv
 	Rscript scripts/tsne-mp2d.R
 
-# TODO: exported fingerprints
-figures/tsne-mp2d.csv: 10-fold-crossvalidations/mp2d/lazar/independent_variables
+figures/tsne-mp2d.csv: $(LAZAR_MODELS_DIR)mutagenicity-mp2d/independent-variables
 	scripts/mp2d-distances.rb > figures/tsne-mp2d.csv
 
 figures/roc.png: figures/roc.csv
@@ -143,6 +140,9 @@ figures/roc.png: figures/roc.csv
 figures/roc.csv: $(CV_SUMMARY)
 	scripts/summary2roc.rb $< > $@
 
+$(PA_FIGURES): tables/pa-table.csv
+	scripts/pa-groups.R
+
 # tables
 
 tables/pa-summary.csv: $(PA_SUMMARY)
@@ -170,15 +170,14 @@ $(PA_SUMMARY): tables/pa-table.csv
 
 # PA lazar predictions
 
-# TODO move to script dir, generate independent_variables from data
-$(PA_CDK_LAZAR_DIR)pa-cdk-predictions.csv: $(PA_CDK_LAZAR_DIR)pa_independent_variables
-	../lazar/bin/batch_padel_classification.rb 10-fold-crossvalidations/lazar-pa-cdk $< > $@
+$(PA_DIR)pa-canonical-smiles: $(PA_DIR)180920_PA_complete_SMILES.csv
+	scripts/pa-smiles.rb $< | obabel -ismi - -ocan | tr -d "\t" > $@
 
-$(PA_MP2D_LAZAR_DIR)pa-mp2d-predictions.csv: $(PA_DATA_DIR)pa-smiles.csv
-	../lazar/bin/batch_fingerprint_classification.rb ../lazar/models/mutagenicity $< > $@
+$(PA_CDK_LAZAR_DIR)predictions: $(LAZAR_PREDICTIONS_DIR)pa-cdk/predictions
+	cp $< $@
 
-$(PA_DATA_DIR)pa-smiles.csv: $(PA_DATA_DIR)180920_PA_complete_SMILES.csv
-	cut -f1,4 -d ';' $< | sed 's/;/,/' > $@
+$(PA_MP2D_LAZAR_DIR)predictions: $(LAZAR_PREDICTIONS_DIR)pa-mp2d/predictions
+	cp $< $@
 
 # crossvalidation summary
 
@@ -222,17 +221,17 @@ $(CONFUSION_MATRICES_DIR)tensorflow-svm-mp2d.csv: $(TENSORFLOW_MP2D_CV_DIR)pred.
 
 ### cdk
 
-$(CONFUSION_MATRICES_DIR)tensorflow-lr-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.lr.v4.csv
+$(CONFUSION_MATRICES_DIR)tensorflow-lr-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.lr.v4.norm.csv
 	scripts/cv-tensorflow-confusion-matrix.rb $< > $@
 
-$(CONFUSION_MATRICES_DIR)tensorflow-lr2-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.lr2.v4.csv
+$(CONFUSION_MATRICES_DIR)tensorflow-lr2-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.lr2.norm.v4.csv
 	scripts/cv-tensorflow-confusion-matrix.rb $< > $@
 
-$(CONFUSION_MATRICES_DIR)tensorflow-nn-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.nn.v4.csv
+$(CONFUSION_MATRICES_DIR)tensorflow-nn-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.nn.v4.norm.csv
 	scripts/cv-tensorflow-confusion-matrix.rb $< > $@
 
-$(CONFUSION_MATRICES_DIR)tensorflow-rf-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.rf.v4.csv
+$(CONFUSION_MATRICES_DIR)tensorflow-rf-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.rf.norm.v4.csv
 	scripts/cv-tensorflow-confusion-matrix.rb $< > $@
 
-$(CONFUSION_MATRICES_DIR)tensorflow-svm-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.svm.v4.csv
+$(CONFUSION_MATRICES_DIR)tensorflow-svm-cdk.csv: $(TENSORFLOW_CDK_CV_DIR)pred.svm.norm.v4.csv
 	scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-- 
cgit v1.2.3