summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2017-02-16 13:11:24 +0100
committer Christoph Helma <helma@in-silico.ch> 2017-02-16 13:11:24 +0100
commit c2c5a94d6dccaf59f30a6415d5b2d20e652b50b4 (patch)
tree 981d9fef32aa88beab4b2c775ad4b0bcd72d1794 /scripts
parent bb8797e0047f02768033cf6839dc926d30c016d2 (diff)
final models, rf, sim 0.5 and 0.1
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/create-median-correlation.rb 1
-rwxr-xr-x scripts/create-testset.rb (renamed from scripts/create-test.rb) 0
-rwxr-xr-x scripts/create-trainingset.rb (renamed from scripts/create-training.rb) 0
-rwxr-xr-x scripts/crossvalidation-plots.R 17
-rwxr-xr-x scripts/crossvalidation-table.rb 24
-rwxr-xr-x scripts/crossvalidation.rb 20
-rwxr-xr-x scripts/include.rb 5
-rwxr-xr-x scripts/median-correlation-plot.R 8
-rwxr-xr-x scripts/prediction-test-correlation-plot.R 9
-rwxr-xr-x scripts/test-correlation-plot.R 17
-rwxr-xr-x scripts/test-prediction-plot.R 2
-rwxr-xr-x scripts/test-prediction.rb 35
-rwxr-xr-x scripts/test-validation-results.rb 6
-rwxr-xr-x scripts/testset-validation.rb (renamed from scripts/test-validation.rb) 2
14 files changed, 88 insertions, 58 deletions
diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb
index 2cdd98e..fe20ad0 100755
--- a/scripts/create-median-correlation.rb
+++ b/scripts/create-median-correlation.rb
@@ -12,6 +12,7 @@ common_compounds.each do |c|
old_values = old.values(c,old.features.first)
new_values = new.values(c,new.features.first)
identical = old_values & new_values
+ # remove identical values from both datasets
unless identical.empty?
old_values -= identical
new_values -= identical
diff --git a/scripts/create-test.rb b/scripts/create-testset.rb
index 03d5c9e..03d5c9e 100755
--- a/scripts/create-test.rb
+++ b/scripts/create-testset.rb
diff --git a/scripts/create-training.rb b/scripts/create-trainingset.rb
index 0976db8..0976db8 100755
--- a/scripts/create-training.rb
+++ b/scripts/create-trainingset.rb
diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R
index 7a4f340..8a4f76e 100755
--- a/scripts/crossvalidation-plots.R
+++ b/scripts/crossvalidation-plots.R
@@ -1,17 +1,8 @@
#!/usr/bin/Rscript
library(ggplot2)
-library(grid)
-library(gridExtra)
-t0 = read.csv("data/training_log10-cv-0.csv",header=T)
-t1 = read.csv("data/training_log10-cv-1.csv",header=T)
-t2 = read.csv("data/training_log10-cv-2.csv",header=T)
-
-p0 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-p1 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-
-pdf('figures/crossvalidation.pdf')
-grid.arrange(p0,p1,p2,ncol=2)
-dev.off()
+nr = commandArgs(TRUE)[1]
+data = read.csv(paste("data/training_log10-cv-",nr,".csv",sep=""))
+img = qplot(LOAEL_predicted,LOAEL_measured_median,data=data,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",colour=Warnings) + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) + scale_color_manual(values=c("#00BFC4", "#F8766D"))
+ggsave(file=paste('figures/crossvalidation',nr,'.pdf',sep=""), plot=img,width=12, height=8)
diff --git a/scripts/crossvalidation-table.rb b/scripts/crossvalidation-table.rb
new file mode 100755
index 0000000..499b166
--- /dev/null
+++ b/scripts/crossvalidation-table.rb
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+require 'yaml'
+csv_file = ARGV[0].sub(/id$/,"csv")
+cv = Validation::RegressionCrossValidation.find File.read(ARGV[0]).chomp
+data = []
+cv.predictions.each do |cid,p|
+ smi = Compound.find(cid).smiles
+ warnings = "F"
+ warnings = "T" if p["warnings"] and !p["warnings"].empty?
+ if p["prediction_interval"]
+ data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1],warnings]
+ else
+ data << [smi,p["value"],p["measurements"].median,nil,nil,warnings]
+ end
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(csv_file,"w+") do |csv|
+ csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high","Warnings"]
+ data.each{|r| csv << r}
+end
diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb
index e02c5ca..6deca60 100755
--- a/scripts/crossvalidation.rb
+++ b/scripts/crossvalidation.rb
@@ -5,25 +5,7 @@ require 'yaml'
name = File.basename ARGV[0], ".csv"
file = File.join "data",ARGV[0]
dataset = Dataset.from_csv_file file
-model = Model::LazarRegression.create(training_dataset: dataset, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }})
-csv_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv"))
+model = Model::LazarRegression.create(training_dataset: dataset)#, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }})
id_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id"))
cv = Validation::RegressionCrossValidation.create model
File.open(id_file,"w+"){|f| f.puts cv.id}
-p cv.id
-data = []
-cv.predictions.each do |cid,p|
- smi = Compound.find(cid).smiles
- if p["prediction_interval"]
- data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1]]
- else
- data << [smi,p["value"],p["measurements"].median,nil,nil]
- end
-end
-
-data.sort!{|a,b| a[1] <=> b[1]}
-
-CSV.open(csv_file,"w+") do |csv|
- csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high"]
- data.each{|r| csv << r}
-end
diff --git a/scripts/include.rb b/scripts/include.rb
deleted file mode 100755
index edc3a64..0000000
--- a/scripts/include.rb
+++ /dev/null
@@ -1,5 +0,0 @@
-require_relative '../lazar/lib/lazar'
-include OpenTox
-DATA = File.join(File.dirname(__FILE__),"data")
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
diff --git a/scripts/median-correlation-plot.R b/scripts/median-correlation-plot.R
new file mode 100755
index 0000000..f4b28c2
--- /dev/null
+++ b/scripts/median-correlation-plot.R
@@ -0,0 +1,8 @@
+#!/usr/bin/Rscript
+
+library(ggplot2)
+
+experimental <- read.csv("data/median-correlation.csv",header=T)
+img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+
+ggsave(file='figures/median-correlation.pdf', plot=img,width=12, height=8)
diff --git a/scripts/prediction-test-correlation-plot.R b/scripts/prediction-test-correlation-plot.R
new file mode 100755
index 0000000..648e864
--- /dev/null
+++ b/scripts/prediction-test-correlation-plot.R
@@ -0,0 +1,9 @@
+#!/usr/bin/Rscript
+
+library(ggplot2)
+
+training = read.csv("data/training-test-predictions.csv",header=T)
+
+img = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)", colour = Warnings) + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) + scale_color_manual(values=c("#00BFC4", "#F8766D"))
+
+ggsave(file='figures/prediction-test-correlation.pdf', plot=img,width=12, height=8)
diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R
deleted file mode 100755
index ef69058..0000000
--- a/scripts/test-correlation-plot.R
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/Rscript
-
-library(ggplot2)
-library(grid)
-library(gridExtra)
-
-experimental <- read.csv("data/median-correlation.csv",header=T)
-p1 = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
-
-training = read.csv("data/training-test-predictions.csv",header=T)
-
-p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
-
-pdf('figures/test-correlation.pdf')
-grid.arrange(p1,p2,ncol=1,respect=T)
-dev.off()
-
diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R
index 512fa82..dddf91a 100755
--- a/scripts/test-prediction-plot.R
+++ b/scripts/test-prediction-plot.R
@@ -6,6 +6,6 @@ data = read.csv("data/predictions-measurements.csv",header=T)
data$SMILES <- reorder(data$SMILES,data$LOAEL)
img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Origin))
img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
-img <- img + geom_point()
+img <- img + geom_point() + scale_color_manual(values=c("#619CFF", "#00BFC4", "#F8766D"))
ggsave(file='figures/test-prediction.pdf', plot=img,width=12, height=8)
diff --git a/scripts/test-prediction.rb b/scripts/test-prediction.rb
new file mode 100755
index 0000000..ff74100
--- /dev/null
+++ b/scripts/test-prediction.rb
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+predictions = {}
+warnings = {}
+CSV.foreach("data/training-test-predictions.csv") do |row|
+ unless row[0] == "SMILES"
+ predictions[row[0]] = row[2]
+ warnings[row[0]] = row[5]
+ end
+end
+
+measurements = {}
+CSV.foreach("data/test_log10.csv") do |row|
+ unless row[0] == "SMILES"
+ measurements[row[0]] ||= []
+ measurements[row[0]] << row[1]
+ end
+end
+
+File.open(File.join("data","predictions-measurements.csv"),"w+") do |f|
+ f.puts ["SMILES","LOAEL","Origin"].join ","
+ predictions.each do |smi,v|
+ if warnings[smi] == "T"
+ f.puts [smi,v,"Warning"].join ","
+ elsif warnings[smi] == "F"
+ f.puts [smi,v,"Prediction"].join ","
+ end
+ measurements[smi].each do |m|
+ f.puts [smi,m,"Measurement"].join ","
+ end
+ end
+end
+
diff --git a/scripts/test-validation-results.rb b/scripts/test-validation-results.rb
index e17d960..b119bb7 100755
--- a/scripts/test-validation-results.rb
+++ b/scripts/test-validation-results.rb
@@ -6,11 +6,13 @@ validation = Validation::TrainTest.find File.read("data/training-test-prediction
data = []
validation.predictions.each do |id,p|
- data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction"]
+ warnings = "F"
+ warnings = "T" if p["warnings"] and !p["warnings"].empty?
+ data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction",warnings]
end
data.sort!{|a,b| a[1] <=> b[1]}
File.open(File.join("data","training-test-predictions.csv"),"w+") do |f|
- f.puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset"].join(",")
+ f.puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset","Warnings"].join(",")
f.puts data.collect{|r| r.join ","}.join("\n")
end
diff --git a/scripts/test-validation.rb b/scripts/testset-validation.rb
index 0b8c0a7..82e5f7d 100755
--- a/scripts/test-validation.rb
+++ b/scripts/testset-validation.rb
@@ -5,6 +5,6 @@ include OpenTox
test = Dataset.from_csv_file(File.join("data","test_log10.csv"))
train = Dataset.from_csv_file(File.join("data","training_log10.csv"))
-model = Model::LazarRegression.create(training_dataset: train, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }})
+model = Model::LazarRegression.create(training_dataset: train)#, algorithms: { :prediction => {:method => "Algorithm::Caret.rf"}, :similarity => { :min => 0.5 }})
validation = Validation::TrainTest.create model, train, test
File.open(File.join("data","training-test-predictions.id"),"w+") { |f| f.puts validation.id }