diff options
author | Christoph Helma <helma@in-silico.ch> | 2017-02-13 15:24:11 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2017-02-13 15:24:11 +0100 |
commit | 04baa2d6ddab1963759f99c87cf8f87cbd435831 (patch) | |
tree | 9302cf57ba42b8c7efb76515e7acafb95ea6e683 /scripts | |
parent | db82eef974b8783c40e7daa504feead3f555fdb8 (diff) |
adjustments for latest lazar version
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/create-median-correlation.rb | 20 |
-rwxr-xr-x | scripts/create-test.rb | 21 |
-rwxr-xr-x | scripts/create-training.rb | 21 |
-rwxr-xr-x | scripts/crossvalidation-plots.R | 16 |
-rwxr-xr-x | scripts/crossvalidation.rb | 33 |
-rwxr-xr-x | scripts/dataset-variability.R | 15 |
-rwxr-xr-x | scripts/functional-groups-images.rb | 26 |
-rwxr-xr-x | scripts/functional-groups.R | 3 |
-rwxr-xr-x | scripts/misclassifications.rb | 12 |
-rwxr-xr-x | scripts/test-correlation-plot.R | 8 |
-rwxr-xr-x | scripts/test-prediction-plot.R | 8 |
-rwxr-xr-x | scripts/test-validation-results.rb | 14 |
-rwxr-xr-x | scripts/test-validation.rb | 32 |
13 files changed, 132 insertions, 97 deletions
diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb index 9a2f6f5..2b932d0 100755 --- a/scripts/create-median-correlation.rb +++ b/scripts/create-median-correlation.rb @@ -1,13 +1,15 @@ -require_relative 'include.rb' +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox -old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") -new = Dataset.from_csv_file File.join(DATA,"swiss.csv") +old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv") +new = Dataset.from_csv_file File.join("data","swiss_log10.csv") -common_compound_ids = (old.compound_ids & new.compound_ids).uniq +common_compounds = (old.compounds & new.compounds).uniq data = [] -common_compound_ids.each do |cid| - c = Compound.find cid +puts ["SMILES","mazzatorta","swiss"].join(",") +common_compounds.each do |c| old_values = old.values(c,old.features.first) new_values = new.values(c,new.features.first) identical = old_values & new_values @@ -21,8 +23,4 @@ common_compound_ids.each do |cid| end data.sort!{|a,b| a[1] <=> b[1]} - -CSV.open(File.join(DATA,"median-correlation.csv"),"w+") do |csv| - csv << ["SMILES","mazzatorta","swiss"] - data.each{|r| csv << r} -end +puts data.collect{|r| r.join ","}.join("\n") diff --git a/scripts/create-test.rb b/scripts/create-test.rb index 782f172..151b5e8 100755 --- a/scripts/create-test.rb +++ b/scripts/create-test.rb @@ -2,14 +2,14 @@ require_relative '../../lazar/lib/lazar' include OpenTox -old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") -new = Dataset.from_csv_file File.join(DATA,"swiss.csv") +old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv") +new = Dataset.from_csv_file File.join("data","swiss_log10.csv") -common_compound_ids = (old.compound_ids & new.compound_ids).uniq +common_compounds = (old.compounds & new.compounds).uniq +puts ["SMILES","-log10(LOAEL)","Dataset"].join "," data = [] -common_compound_ids.each do |cid| - c = Compound.find cid 
+common_compounds.each do |c| old_values = old.values(c,old.features.first) new_values = new.values(c,new.features.first) identical = old_values & new_values @@ -18,19 +18,16 @@ common_compound_ids.each do |cid| new_values -= identical end identical.each do |v| - data << [c.smiles,v,"mazzatorta, swiss"] + data << [c.smiles,v,"mazzatorta and swiss"] if v end old_values.each do |v| - data << [c.smiles,v,"mazzatorta"] + data << [c.smiles,v,"mazzatorta"] if v end new_values.each do |v| - data << [c.smiles,v,"swiss"] + data << [c.smiles,v,"swiss"] if v end end data.sort!{|a,b| a[1] <=> b[1]} -CSV.open(File.join(DATA,"test.csv"),"w+") do |csv| - csv << ["SMILES","LOAEL","Dataset"] - data.each{|r| csv << r} -end +puts data.collect{|r| r.join ","}.join "\n" diff --git a/scripts/create-training.rb b/scripts/create-training.rb index d05bc1c..8fca3f4 100755 --- a/scripts/create-training.rb +++ b/scripts/create-training.rb @@ -2,14 +2,14 @@ require_relative '../../lazar/lib/lazar' include OpenTox -old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") -new = Dataset.from_csv_file File.join(DATA,"swiss.csv") +old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv") +new = Dataset.from_csv_file File.join("data","swiss_log10.csv") -common_compound_ids = (old.compound_ids + new.compound_ids).uniq +common_compounds = (old.compounds + new.compounds).uniq +puts ["SMILES","-log10(LOAEL)","Dataset"].join "," data = [] -common_compound_ids.each do |cid| - c = Compound.find cid +common_compounds.each do |c| old_values = old.values(c,old.features.first) new_values = new.values(c,new.features.first) identical = old_values & new_values @@ -18,19 +18,16 @@ common_compound_ids.each do |cid| new_values -= identical end identical.each do |v| - data << [c.smiles,v,"mazzatorta, swiss"] + data << [c.smiles,v,"mazzatorta and swiss"] if v end old_values.each do |v| - data << [c.smiles,v,"mazzatorta"] + data << [c.smiles,v,"mazzatorta"] if v end new_values.each do |v| - data << 
[c.smiles,v,"swiss"] + data << [c.smiles,v,"swiss"] if v end end data.sort!{|a,b| a[1] <=> b[1]} -CSV.open(File.join(DATA,"training.csv"),"w+") do |csv| - csv << ["SMILES","LOAEL","Dataset"] - data.each{|r| csv << r} -end +puts data.collect{|r| r.join ","}.join "\n" diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R index 2bc259f..7a4f340 100755 --- a/scripts/crossvalidation-plots.R +++ b/scripts/crossvalidation-plots.R @@ -1,15 +1,17 @@ +#!/usr/bin/Rscript + library(ggplot2) library(grid) library(gridExtra) -t0 = read.csv("data/training-cv-0.csv",header=T) -t1 = read.csv("data/training-cv-1.csv",header=T) -t2 = read.csv("data/training-cv-2.csv",header=T) +t0 = read.csv("data/training_log10-cv-0.csv",header=T) +t1 = read.csv("data/training_log10-cv-1.csv",header=T) +t2 = read.csv("data/training_log10-cv-2.csv",header=T) -p0 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) -p1 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) -p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) +p0 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) +p1 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) +p2 = 
qplot(LOAEL_predicted,LOAEL_measured_median,data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) -pdf('figure/crossvalidation.pdf') +pdf('figures/crossvalidation.pdf') grid.arrange(p0,p1,p2,ncol=2) dev.off() diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb index 79aeb83..9657af1 100755 --- a/scripts/crossvalidation.rb +++ b/scripts/crossvalidation.rb @@ -1,25 +1,34 @@ -require_relative 'include.rb' - +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox +require 'yaml' name = File.basename ARGV[0], ".csv" -file = File.join DATA,ARGV[0] +file = File.join "data",ARGV[0] dataset = Dataset.from_csv_file file -model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") +model = Model::LazarRegression.create(training_dataset: dataset)#, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") #model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") #model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average") -csv_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv")) -id_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id")) -cv = RegressionCrossValidation.create model +csv_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv")) +id_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id")) +cv = Validation::RegressionCrossValidation.create model File.open(id_file,"w+"){|f| f.puts cv.id} - +#cv = Validation::RegressionCrossValidation.first +p cv.id data = [] -cv.predictions.each do |p| - smi = Compound.find(p[0]).smiles - data << [smi,p[1].median,p[2],p[3]] +cv.predictions.each do |cid,p| + smi = Compound.find(cid).smiles + if 
p["prediction_interval"] + data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1]] + else + data << [smi,p["value"],p["measurements"].median,nil,nil] + end end data.sort!{|a,b| a[1] <=> b[1]} CSV.open(csv_file,"w+") do |csv| - csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Confidence"] + csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high"] data.each{|r| csv << r} end +=begin +=end diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R index 775fd03..65234a4 100755 --- a/scripts/dataset-variability.R +++ b/scripts/dataset-variability.R @@ -1,17 +1,18 @@ +#!/usr/bin/Rscript library(ggplot2) library(grid) library(gridExtra) -m = read.csv("data/mazzatorta.csv",header=T) -s = read.csv("data/swiss.csv",header=T) +m = read.csv("data/mazzatorta_log10.csv",header=T) +s = read.csv("data/swiss_log10.csv",header=T) m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)]) s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)]) m.dup = m[m$SMILES %in% m.dupsmi,] s.dup = s[s$SMILES %in% s.dupsmi,] -m.dup$LOAEL= -log10(m.dup$LOAEL) -s.dup$LOAEL= -log10(s.dup$LOAEL) +#m.dup$LOAEL= -log10(m.dup$LOAEL) +#s.dup$LOAEL= -log10(s.dup$LOAEL) m.dup$SMILES <- reorder(m.dup$SMILES,m.dup$LOAEL) s.dup$SMILES <- reorder(s.dup$SMILES,s.dup$LOAEL) @@ -22,12 +23,12 @@ p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab #grid.arrange(p1,p2,ncol=1) #dev.off() -data <- read.csv("data/test.csv",header=T) -data$LOAEL = -log(data$LOAEL) +data <- read.csv("data/test_log10.csv",header=T) +#data$LOAEL = -log(data$LOAEL) data$SMILES <- reorder(data$SMILES,data$LOAEL) img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point() img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) img = img + 
scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office")) img = img -ggsave(file='figure/dataset-variability.pdf', plot=img, width=12,height=8) +ggsave(file='figures/dataset-variability.pdf', plot=img, width=12,height=8) diff --git a/scripts/functional-groups-images.rb b/scripts/functional-groups-images.rb new file mode 100755 index 0000000..346bd5f --- /dev/null +++ b/scripts/functional-groups-images.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby +# www.smartsview.de/smartsview/auto/<image-format>/<visualization-modus>/<legend-option>/<SMARTS> +# syntax rules +# image-format: pdf, png or svg +# visualization modus: 1 or 2 (1 = Complete Visualization, 2 = Element Symbols) +# legend option: both, none, static, dynamic +# SMARTS: All special symbols used in SMARTS can be used except '#', which has to be escaped with %23 + +require 'uri' +SERVICE_URI = "http://www.smartsview.de/smartsview/auto/pdf/2/both/" + + +inFile = File.join("data","functional-groups-smarts.csv") +hash = {} +File.readlines(inFile).each do |line| + columns = line.split(",",2) + group = columns[0].strip + smarts = columns[1].sub(/^'|'$/,"").strip.sub(/^'|'$/,"").strip + hash[group] = smarts +end + +hash.each do |group,smarts| + `wget '#{URI.escape(SERVICE_URI+smarts)}' -O "#{File.join("figures",group+'.pdf')}"` +end + + diff --git a/scripts/functional-groups.R b/scripts/functional-groups.R index 01f2043..6121073 100755 --- a/scripts/functional-groups.R +++ b/scripts/functional-groups.R @@ -1,3 +1,4 @@ +#!/usr/bin/Rscript library("ggplot2") data <- read.csv("data/functional-groups-reduced4R.csv",header=F) @@ -6,4 +7,4 @@ names(data) = c("V1","V2","Dataset") data$V1 <- reorder(data$V1,-data$V2) ggplot(data,aes(x=V1,y=V2,fill=Dataset)) + geom_bar(stat="identity", position=position_dodge()) + xlab("") + ylab("") + coord_flip() -ggsave("figure/functional-groups.pdf") +ggsave("figures/functional-groups.pdf") diff --git a/scripts/misclassifications.rb b/scripts/misclassifications.rb index 
171077c..d285868 100755 --- a/scripts/misclassifications.rb +++ b/scripts/misclassifications.rb @@ -1,4 +1,6 @@ -require_relative 'include.rb' +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox class Range def intersection(other) @@ -9,21 +11,21 @@ class Range end experimental = {} -CSV.foreach(File.join(DATA,"test.csv")) do |row| +CSV.foreach(File.join("data","test_log10.csv")) do |row| experimental[row[0]] ||= [] experimental[row[0]] << row[1].to_f end predictions = {} -CSV.foreach(File.join(DATA,"training-test-predictions.csv"),:headers => true) do |row| - predictions[row[0]] = [-Math.log10(row[2].to_f),Math.log10(row[3].to_f).abs] +CSV.foreach(File.join("data","training-test-predictions.csv"),:headers => true) do |row| + predictions[row[0]] = [row[2].to_f,row[3].to_f.abs] end outside_experimental_values = 0 within_experimental_values = 0 out = [] predictions.each do |smi,pred| - exp = experimental[smi].collect{|e| -Math.log10(e)}.uniq + exp = experimental[smi].uniq # https://en.wikipedia.org/wiki/Prediction_interval min = pred[0]-1.96*pred[1] max = pred[0]+1.96*pred[1] diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R index 74a2739..ef69058 100755 --- a/scripts/test-correlation-plot.R +++ b/scripts/test-correlation-plot.R @@ -1,15 +1,17 @@ +#!/usr/bin/Rscript + library(ggplot2) library(grid) library(gridExtra) experimental <- read.csv("data/median-correlation.csv",header=T) -p1 = qplot(-log10(mazzatorta),-log10(swiss),data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) +p1 = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) training = 
read.csv("data/training-test-predictions.csv",header=T) -p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) +p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) -pdf('figure/test-correlation.pdf') +pdf('figures/test-correlation.pdf') grid.arrange(p1,p2,ncol=1,respect=T) dev.off() diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R index db003d3..7201e1a 100755 --- a/scripts/test-prediction-plot.R +++ b/scripts/test-prediction-plot.R @@ -1,7 +1,9 @@ +#!/usr/bin/Rscript + library(ggplot2) training = read.csv("data/training-test-predictions.csv",header=T) -test <- read.csv("data/test.csv",header=T) +test <- read.csv("data/test_log10.csv",header=T) n = c("SMILES","LOAEL","Source") data = data.frame(factor(test$SMILES),test$LOAEL,factor(test$Dataset)) @@ -11,11 +13,11 @@ comb = data.frame(factor(training$SMILES),training$LOAEL_predicted,factor(traini names(comb) = n comb$Type = "predicted" data = rbind(data,comb) -data$LOAEL = -log(data$LOAEL) +#data$LOAEL = -log(data$LOAEL) data$SMILES <- reorder(data$SMILES,data$LOAEL) #img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),shape=Source,color=Type)) img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Type)) img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) img <- img + geom_point() -ggsave(file='figure/test-prediction.pdf', plot=img,width=12, height=8) +ggsave(file='figures/test-prediction.pdf', plot=img,width=12, height=8) diff --git a/scripts/test-validation-results.rb b/scripts/test-validation-results.rb new file 
mode 100755 index 0000000..2750019 --- /dev/null +++ b/scripts/test-validation-results.rb @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox + +validation = Validation::TrainTest.find File.read("data/training-test-predictions.id").chomp + +data = [] +puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset"].join(",") +validation.predictions.each do |id,p| + data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction"] +end + +data.sort!{|a,b| a[1] <=> b[1]} +puts data.collect{|r| r.join ","}.join("\n") diff --git a/scripts/test-validation.rb b/scripts/test-validation.rb index 0bbcc42..5c07449 100755 --- a/scripts/test-validation.rb +++ b/scripts/test-validation.rb @@ -1,26 +1,10 @@ -require_relative "include.rb" +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox -test = Dataset.from_csv_file(File.join(DATA,"test.csv")) +test = Dataset.from_csv_file(File.join("data","test_log10.csv")) +train = Dataset.from_csv_file(File.join("data","training_log10.csv")) -file = File.join(DATA,ARGV[0]) -dataset = Dataset.from_csv_file file -model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") -#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") -#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average") -validation = RegressionValidation.create model, dataset, test -csv_file = file.sub(".csv","-test-predictions.csv") -id_file = file.sub(".csv","-test-predictions.id") -File.open(id_file,"w+"){|f| f.puts validation.id} -name = File.basename(ARGV[0],".csv") - -data = [] -validation.predictions.each do |p| - data << [Compound.find(p[0]).smiles, p[1].median, p[2], p[3],"#{name}-prediction"] -end 
- -data.sort!{|a,b| a[1] <=> b[1]} - -CSV.open(csv_file,"w+") do |csv| - csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","RMSE","Dataset"] - data.each{|r| csv << r} -end +model = Model::LazarRegression.create(training_dataset: train) +validation = Validation::TrainTest.create model, train, test +puts validation.id |