diff options
author | Christoph Helma <helma@in-silico.ch> | 2017-02-16 13:11:24 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2017-02-16 13:11:24 +0100 |
commit | c2c5a94d6dccaf59f30a6415d5b2d20e652b50b4 (patch) | |
tree | 981d9fef32aa88beab4b2c775ad4b0bcd72d1794 /scripts | |
parent | bb8797e0047f02768033cf6839dc926d30c016d2 (diff) |
final models, rf, sim 0.5 and 0.1
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/create-median-correlation.rb | 1 | ||||
-rwxr-xr-x | scripts/create-testset.rb (renamed from scripts/create-test.rb) | 0 | ||||
-rwxr-xr-x | scripts/create-trainingset.rb (renamed from scripts/create-training.rb) | 0 | ||||
-rwxr-xr-x | scripts/crossvalidation-plots.R | 17 | ||||
-rwxr-xr-x | scripts/crossvalidation-table.rb | 24 | ||||
-rwxr-xr-x | scripts/crossvalidation.rb | 20 | ||||
-rwxr-xr-x | scripts/include.rb | 5 | ||||
-rwxr-xr-x | scripts/median-correlation-plot.R | 8 | ||||
-rwxr-xr-x | scripts/prediction-test-correlation-plot.R | 9 | ||||
-rwxr-xr-x | scripts/test-correlation-plot.R | 17 | ||||
-rwxr-xr-x | scripts/test-prediction-plot.R | 2 | ||||
-rwxr-xr-x | scripts/test-prediction.rb | 35 | ||||
-rwxr-xr-x | scripts/test-validation-results.rb | 6 | ||||
-rwxr-xr-x | scripts/testset-validation.rb (renamed from scripts/test-validation.rb) | 2 |
14 files changed, 88 insertions, 58 deletions
diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb index 2cdd98e..fe20ad0 100755 --- a/scripts/create-median-correlation.rb +++ b/scripts/create-median-correlation.rb @@ -12,6 +12,7 @@ common_compounds.each do |c| old_values = old.values(c,old.features.first) new_values = new.values(c,new.features.first) identical = old_values & new_values + # remove identical values from both datasets unless identical.empty? old_values -= identical new_values -= identical diff --git a/scripts/create-test.rb b/scripts/create-testset.rb index 03d5c9e..03d5c9e 100755 --- a/scripts/create-test.rb +++ b/scripts/create-testset.rb diff --git a/scripts/create-training.rb b/scripts/create-trainingset.rb index 0976db8..0976db8 100755 --- a/scripts/create-training.rb +++ b/scripts/create-trainingset.rb diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R index 7a4f340..8a4f76e 100755 --- a/scripts/crossvalidation-plots.R +++ b/scripts/crossvalidation-plots.R @@ -1,17 +1,8 @@ #!/usr/bin/Rscript library(ggplot2) -library(grid) -library(gridExtra) -t0 = read.csv("data/training_log10-cv-0.csv",header=T) -t1 = read.csv("data/training_log10-cv-1.csv",header=T) -t2 = read.csv("data/training_log10-cv-2.csv",header=T) - -p0 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) -p1 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) -p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) - -pdf('figures/crossvalidation.pdf') -grid.arrange(p0,p1,p2,ncol=2) -dev.off() +nr = commandArgs(TRUE)[1] +data = read.csv(paste("data/training_log10-cv-",nr,".csv",sep="")) +img = qplot(LOAEL_predicted,LOAEL_measured_median,data=data,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",colour=Warnings) + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) + scale_color_manual(values=c("#00BFC4", "#F8766D")) +ggsave(file=paste('figures/crossvalidation',nr,'.pdf',sep=""), plot=img,width=12, height=8) diff --git a/scripts/crossvalidation-table.rb b/scripts/crossvalidation-table.rb new file mode 100755 index 0000000..499b166 --- /dev/null +++ b/scripts/crossvalidation-table.rb @@ -0,0 +1,24 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox +require 'yaml' +csv_file = ARGV[0].sub(/id$/,"csv") +cv = Validation::RegressionCrossValidation.find File.read(ARGV[0]).chomp +data = [] +cv.predictions.each do |cid,p| + smi = Compound.find(cid).smiles + warnings = "F" + warnings = "T" if p["warnings"] and !p["warnings"].empty? + if p["prediction_interval"] + data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1],warnings] + else + data << [smi,p["value"],p["measurements"].median,nil,nil,warnings] + end +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(csv_file,"w+") do |csv| + csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high","Warnings"] + data.each{|r| csv << r} +end diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb index e02c5ca..6deca60 100755 --- a/scripts/crossvalidation.rb +++ b/scripts/crossvalidation.rb @@ -5,25 +5,7 @@ require 'yaml' name = File.basename ARGV[0], ".csv" file = File.join "data",ARGV[0] dataset = Dataset.from_csv_file file -model = Model::LazarRegression.create(training_dataset: dataset, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }}) -csv_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv")) +model = Model::LazarRegression.create(training_dataset: dataset)#, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }}) id_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id")) cv = Validation::RegressionCrossValidation.create model File.open(id_file,"w+"){|f| f.puts cv.id} -p cv.id -data = [] -cv.predictions.each do |cid,p| - smi = Compound.find(cid).smiles - if p["prediction_interval"] - data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1]] - else - data << [smi,p["value"],p["measurements"].median,nil,nil] - end -end - -data.sort!{|a,b| a[1] <=> b[1]} - -CSV.open(csv_file,"w+") do |csv| - csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high"] - data.each{|r| csv << r} -end diff --git a/scripts/include.rb b/scripts/include.rb deleted file mode 100755 index edc3a64..0000000 --- a/scripts/include.rb +++ /dev/null @@ -1,5 +0,0 @@ -require_relative '../lazar/lib/lazar' -include OpenTox -DATA = File.join(File.dirname(__FILE__),"data") -#$mongo.database.drop -#$gridfs = $mongo.database.fs diff --git a/scripts/median-correlation-plot.R b/scripts/median-correlation-plot.R new file mode 100755 index 0000000..f4b28c2 --- /dev/null +++ b/scripts/median-correlation-plot.R @@ -0,0 +1,8 @@ +#!/usr/bin/Rscript + +library(ggplot2) + +experimental <- read.csv("data/median-correlation.csv",header=T) +img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) + +ggsave(file='figures/median-correlation.pdf', plot=img,width=12, height=8) diff --git a/scripts/prediction-test-correlation-plot.R b/scripts/prediction-test-correlation-plot.R new file mode 100755 index 0000000..648e864 --- /dev/null +++ b/scripts/prediction-test-correlation-plot.R @@ -0,0 +1,9 @@ +#!/usr/bin/Rscript + +library(ggplot2) + +training = read.csv("data/training-test-predictions.csv",header=T) + +img = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)", colour = Warnings) + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) + scale_color_manual(values=c("#00BFC4", "#F8766D")) + +ggsave(file='figures/prediction-test-correlation.pdf', plot=img,width=12, height=8) diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R deleted file mode 100755 index ef69058..0000000 --- a/scripts/test-correlation-plot.R +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/Rscript - -library(ggplot2) -library(grid) -library(gridExtra) - -experimental <- read.csv("data/median-correlation.csv",header=T) -p1 = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) - -training = read.csv("data/training-test-predictions.csv",header=T) - -p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) - -pdf('figures/test-correlation.pdf') -grid.arrange(p1,p2,ncol=1,respect=T) -dev.off() - diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R index 512fa82..dddf91a 100755 --- a/scripts/test-prediction-plot.R +++ b/scripts/test-prediction-plot.R @@ -6,6 +6,6 @@ data = read.csv("data/predictions-measurements.csv",header=T) data$SMILES <- reorder(data$SMILES,data$LOAEL) img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Origin)) img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) -img <- img + geom_point() +img <- img + geom_point() + scale_color_manual(values=c("#619CFF", "#00BFC4", "#F8766D")) ggsave(file='figures/test-prediction.pdf', plot=img,width=12, height=8) diff --git a/scripts/test-prediction.rb b/scripts/test-prediction.rb new file mode 100755 index 0000000..ff74100 --- /dev/null +++ b/scripts/test-prediction.rb @@ -0,0 +1,35 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox + +predictions = {} +warnings = {} +CSV.foreach("data/training-test-predictions.csv") do |row| + unless row[0] == "SMILES" + predictions[row[0]] = row[2] + warnings[row[0]] = row[5] + end +end + +measurements = {} +CSV.foreach("data/test_log10.csv") do |row| + unless row[0] == "SMILES" + measurements[row[0]] ||= [] + measurements[row[0]] << row[1] + end +end + +File.open(File.join("data","predictions-measurements.csv"),"w+") do |f| + f.puts ["SMILES","LOAEL","Origin"].join "," + predictions.each do |smi,v| + if warnings[smi] == "T" + f.puts [smi,v,"Warning"].join "," + elsif warnings[smi] == "F" + f.puts [smi,v,"Prediction"].join "," + end + measurements[smi].each do |m| + f.puts [smi,m,"Measurement"].join "," + end + end +end + diff --git a/scripts/test-validation-results.rb b/scripts/test-validation-results.rb index e17d960..b119bb7 100755 --- a/scripts/test-validation-results.rb +++ b/scripts/test-validation-results.rb @@ -6,11 +6,13 @@ validation = Validation::TrainTest.find File.read("data/training-test-prediction data = [] validation.predictions.each do |id,p| - data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction"] + warnings = "F" + warnings = "T" if p["warnings"] and !p["warnings"].empty? + data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction",warnings] end data.sort!{|a,b| a[1] <=> b[1]} File.open(File.join("data","training-test-predictions.csv"),"w+") do |f| - f.puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset"].join(",") + f.puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset","Warnings"].join(",") f.puts data.collect{|r| r.join ","}.join("\n") end diff --git a/scripts/test-validation.rb b/scripts/testset-validation.rb index 0b8c0a7..82e5f7d 100755 --- a/scripts/test-validation.rb +++ b/scripts/testset-validation.rb @@ -5,6 +5,6 @@ include OpenTox test = Dataset.from_csv_file(File.join("data","test_log10.csv")) train = Dataset.from_csv_file(File.join("data","training_log10.csv")) -model = Model::LazarRegression.create(training_dataset: train, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }}) +model = Model::LazarRegression.create(training_dataset: train)#, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }}) validation = Validation::TrainTest.create model, train, test File.open(File.join("data","training-test-predictions.id"),"w+") { |f| f.puts validation.id } |