summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2017-02-13 15:24:11 +0100
committer Christoph Helma <helma@in-silico.ch> 2017-02-13 15:24:11 +0100
commit 04baa2d6ddab1963759f99c87cf8f87cbd435831 (patch)
tree 9302cf57ba42b8c7efb76515e7acafb95ea6e683 /scripts
parent db82eef974b8783c40e7daa504feead3f555fdb8 (diff)
adjustments for latest lazar version
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/create-median-correlation.rb 20
-rwxr-xr-x scripts/create-test.rb 21
-rwxr-xr-x scripts/create-training.rb 21
-rwxr-xr-x scripts/crossvalidation-plots.R 16
-rwxr-xr-x scripts/crossvalidation.rb 33
-rwxr-xr-x scripts/dataset-variability.R 15
-rwxr-xr-x scripts/functional-groups-images.rb 26
-rwxr-xr-x scripts/functional-groups.R 3
-rwxr-xr-x scripts/misclassifications.rb 12
-rwxr-xr-x scripts/test-correlation-plot.R 8
-rwxr-xr-x scripts/test-prediction-plot.R 8
-rwxr-xr-x scripts/test-validation-results.rb 14
-rwxr-xr-x scripts/test-validation.rb 32
13 files changed, 132 insertions, 97 deletions
diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb
index 9a2f6f5..2b932d0 100755
--- a/scripts/create-median-correlation.rb
+++ b/scripts/create-median-correlation.rb
@@ -1,13 +1,15 @@
-require_relative 'include.rb'
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
-old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
-new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv")
+new = Dataset.from_csv_file File.join("data","swiss_log10.csv")
-common_compound_ids = (old.compound_ids & new.compound_ids).uniq
+common_compounds = (old.compounds & new.compounds).uniq
data = []
-common_compound_ids.each do |cid|
- c = Compound.find cid
+puts ["SMILES","mazzatorta","swiss"].join(",")
+common_compounds.each do |c|
old_values = old.values(c,old.features.first)
new_values = new.values(c,new.features.first)
identical = old_values & new_values
@@ -21,8 +23,4 @@ common_compound_ids.each do |cid|
end
data.sort!{|a,b| a[1] <=> b[1]}
-
-CSV.open(File.join(DATA,"median-correlation.csv"),"w+") do |csv|
- csv << ["SMILES","mazzatorta","swiss"]
- data.each{|r| csv << r}
-end
+puts data.collect{|r| r.join ","}.join("\n")
diff --git a/scripts/create-test.rb b/scripts/create-test.rb
index 782f172..151b5e8 100755
--- a/scripts/create-test.rb
+++ b/scripts/create-test.rb
@@ -2,14 +2,14 @@
require_relative '../../lazar/lib/lazar'
include OpenTox
-old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
-new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv")
+new = Dataset.from_csv_file File.join("data","swiss_log10.csv")
-common_compound_ids = (old.compound_ids & new.compound_ids).uniq
+common_compounds = (old.compounds & new.compounds).uniq
+puts ["SMILES","-log10(LOAEL)","Dataset"].join ","
data = []
-common_compound_ids.each do |cid|
- c = Compound.find cid
+common_compounds.each do |c|
old_values = old.values(c,old.features.first)
new_values = new.values(c,new.features.first)
identical = old_values & new_values
@@ -18,19 +18,16 @@ common_compound_ids.each do |cid|
new_values -= identical
end
identical.each do |v|
- data << [c.smiles,v,"mazzatorta, swiss"]
+ data << [c.smiles,v,"mazzatorta and swiss"] if v
end
old_values.each do |v|
- data << [c.smiles,v,"mazzatorta"]
+ data << [c.smiles,v,"mazzatorta"] if v
end
new_values.each do |v|
- data << [c.smiles,v,"swiss"]
+ data << [c.smiles,v,"swiss"] if v
end
end
data.sort!{|a,b| a[1] <=> b[1]}
-CSV.open(File.join(DATA,"test.csv"),"w+") do |csv|
- csv << ["SMILES","LOAEL","Dataset"]
- data.each{|r| csv << r}
-end
+puts data.collect{|r| r.join ","}.join "\n"
diff --git a/scripts/create-training.rb b/scripts/create-training.rb
index d05bc1c..8fca3f4 100755
--- a/scripts/create-training.rb
+++ b/scripts/create-training.rb
@@ -2,14 +2,14 @@
require_relative '../../lazar/lib/lazar'
include OpenTox
-old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
-new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+old = Dataset.from_csv_file File.join("data","mazzatorta_log10.csv")
+new = Dataset.from_csv_file File.join("data","swiss_log10.csv")
-common_compound_ids = (old.compound_ids + new.compound_ids).uniq
+common_compounds = (old.compounds + new.compounds).uniq
+puts ["SMILES","-log10(LOAEL)","Dataset"].join ","
data = []
-common_compound_ids.each do |cid|
- c = Compound.find cid
+common_compounds.each do |c|
old_values = old.values(c,old.features.first)
new_values = new.values(c,new.features.first)
identical = old_values & new_values
@@ -18,19 +18,16 @@ common_compound_ids.each do |cid|
new_values -= identical
end
identical.each do |v|
- data << [c.smiles,v,"mazzatorta, swiss"]
+ data << [c.smiles,v,"mazzatorta and swiss"] if v
end
old_values.each do |v|
- data << [c.smiles,v,"mazzatorta"]
+ data << [c.smiles,v,"mazzatorta"] if v
end
new_values.each do |v|
- data << [c.smiles,v,"swiss"]
+ data << [c.smiles,v,"swiss"] if v
end
end
data.sort!{|a,b| a[1] <=> b[1]}
-CSV.open(File.join(DATA,"training.csv"),"w+") do |csv|
- csv << ["SMILES","LOAEL","Dataset"]
- data.each{|r| csv << r}
-end
+puts data.collect{|r| r.join ","}.join "\n"
diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R
index 2bc259f..7a4f340 100755
--- a/scripts/crossvalidation-plots.R
+++ b/scripts/crossvalidation-plots.R
@@ -1,15 +1,17 @@
+#!/usr/bin/Rscript
+
library(ggplot2)
library(grid)
library(gridExtra)
-t0 = read.csv("data/training-cv-0.csv",header=T)
-t1 = read.csv("data/training-cv-1.csv",header=T)
-t2 = read.csv("data/training-cv-2.csv",header=T)
+t0 = read.csv("data/training_log10-cv-0.csv",header=T)
+t1 = read.csv("data/training_log10-cv-1.csv",header=T)
+t2 = read.csv("data/training_log10-cv-2.csv",header=T)
-p0 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-p1 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
+p0 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
+p1 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
+p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
-pdf('figure/crossvalidation.pdf')
+pdf('figures/crossvalidation.pdf')
grid.arrange(p0,p1,p2,ncol=2)
dev.off()
diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb
index 79aeb83..9657af1 100755
--- a/scripts/crossvalidation.rb
+++ b/scripts/crossvalidation.rb
@@ -1,25 +1,34 @@
-require_relative 'include.rb'
-
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+require 'yaml'
name = File.basename ARGV[0], ".csv"
-file = File.join DATA,ARGV[0]
+file = File.join "data",ARGV[0]
dataset = Dataset.from_csv_file file
-model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
+model = Model::LazarRegression.create(training_dataset: dataset)#, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average")
-csv_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv"))
-id_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id"))
-cv = RegressionCrossValidation.create model
+csv_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv"))
+id_file = File.join("data",ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id"))
+cv = Validation::RegressionCrossValidation.create model
File.open(id_file,"w+"){|f| f.puts cv.id}
-
+#cv = Validation::RegressionCrossValidation.first
+p cv.id
data = []
-cv.predictions.each do |p|
- smi = Compound.find(p[0]).smiles
- data << [smi,p[1].median,p[2],p[3]]
+cv.predictions.each do |cid,p|
+ smi = Compound.find(cid).smiles
+ if p["prediction_interval"]
+ data << [smi,p["value"],p["measurements"].median,p["prediction_interval"][0],p["prediction_interval"][1]]
+ else
+ data << [smi,p["value"],p["measurements"].median,nil,nil]
+ end
end
data.sort!{|a,b| a[1] <=> b[1]}
CSV.open(csv_file,"w+") do |csv|
- csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Confidence"]
+ csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Prediction_interval_low","Prediction_interval_high"]
data.each{|r| csv << r}
end
+=begin
+=end
diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R
index 775fd03..65234a4 100755
--- a/scripts/dataset-variability.R
+++ b/scripts/dataset-variability.R
@@ -1,17 +1,18 @@
+#!/usr/bin/Rscript
library(ggplot2)
library(grid)
library(gridExtra)
-m = read.csv("data/mazzatorta.csv",header=T)
-s = read.csv("data/swiss.csv",header=T)
+m = read.csv("data/mazzatorta_log10.csv",header=T)
+s = read.csv("data/swiss_log10.csv",header=T)
m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)])
s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)])
m.dup = m[m$SMILES %in% m.dupsmi,]
s.dup = s[s$SMILES %in% s.dupsmi,]
-m.dup$LOAEL= -log10(m.dup$LOAEL)
-s.dup$LOAEL= -log10(s.dup$LOAEL)
+#m.dup$LOAEL= -log10(m.dup$LOAEL)
+#s.dup$LOAEL= -log10(s.dup$LOAEL)
m.dup$SMILES <- reorder(m.dup$SMILES,m.dup$LOAEL)
s.dup$SMILES <- reorder(s.dup$SMILES,s.dup$LOAEL)
@@ -22,12 +23,12 @@ p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab
#grid.arrange(p1,p2,ncol=1)
#dev.off()
-data <- read.csv("data/test.csv",header=T)
-data$LOAEL = -log(data$LOAEL)
+data <- read.csv("data/test_log10.csv",header=T)
+#data$LOAEL = -log(data$LOAEL)
data$SMILES <- reorder(data$SMILES,data$LOAEL)
img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point()
img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
img = img + scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office"))
img = img
-ggsave(file='figure/dataset-variability.pdf', plot=img, width=12,height=8)
+ggsave(file='figures/dataset-variability.pdf', plot=img, width=12,height=8)
diff --git a/scripts/functional-groups-images.rb b/scripts/functional-groups-images.rb
new file mode 100755
index 0000000..346bd5f
--- /dev/null
+++ b/scripts/functional-groups-images.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+# www.smartsview.de/smartsview/auto/<image-format>/<visualization-modus>/<legend-option>/<SMARTS>
+# syntax rules
+# image-format: pdf, png or svg
+# visualization modus: 1 or 2 (1 = Complete Visualization, 2 = Element Symbols)
+# legend option: both, none, static, dynamic
+# SMARTS: All special symbols used in SMARTS can be used except '#', which has to be escaped with %23
+
+require 'uri'
+SERVICE_URI = "http://www.smartsview.de/smartsview/auto/pdf/2/both/"
+
+
+inFile = File.join("data","functional-groups-smarts.csv")
+hash = {}
+File.readlines(inFile).each do |line|
+ columns = line.split(",",2)
+ group = columns[0].strip
+ smarts = columns[1].sub(/^'|'$/,"").strip.sub(/^'|'$/,"").strip
+ hash[group] = smarts
+end
+
+hash.each do |group,smarts|
+ `wget '#{URI.escape(SERVICE_URI+smarts)}' -O "#{File.join("figures",group+'.pdf')}"`
+end
+
+
diff --git a/scripts/functional-groups.R b/scripts/functional-groups.R
index 01f2043..6121073 100755
--- a/scripts/functional-groups.R
+++ b/scripts/functional-groups.R
@@ -1,3 +1,4 @@
+#!/usr/bin/Rscript
library("ggplot2")
data <- read.csv("data/functional-groups-reduced4R.csv",header=F)
@@ -6,4 +7,4 @@ names(data) = c("V1","V2","Dataset")
data$V1 <- reorder(data$V1,-data$V2)
ggplot(data,aes(x=V1,y=V2,fill=Dataset)) + geom_bar(stat="identity", position=position_dodge()) + xlab("") + ylab("") + coord_flip()
-ggsave("figure/functional-groups.pdf")
+ggsave("figures/functional-groups.pdf")
diff --git a/scripts/misclassifications.rb b/scripts/misclassifications.rb
index 171077c..d285868 100755
--- a/scripts/misclassifications.rb
+++ b/scripts/misclassifications.rb
@@ -1,4 +1,6 @@
-require_relative 'include.rb'
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
class Range
def intersection(other)
@@ -9,21 +11,21 @@ class Range
end
experimental = {}
-CSV.foreach(File.join(DATA,"test.csv")) do |row|
+CSV.foreach(File.join("data","test_log10.csv")) do |row|
experimental[row[0]] ||= []
experimental[row[0]] << row[1].to_f
end
predictions = {}
-CSV.foreach(File.join(DATA,"training-test-predictions.csv"),:headers => true) do |row|
- predictions[row[0]] = [-Math.log10(row[2].to_f),Math.log10(row[3].to_f).abs]
+CSV.foreach(File.join("data","training-test-predictions.csv"),:headers => true) do |row|
+ predictions[row[0]] = [row[2].to_f,row[3].to_f.abs]
end
outside_experimental_values = 0
within_experimental_values = 0
out = []
predictions.each do |smi,pred|
- exp = experimental[smi].collect{|e| -Math.log10(e)}.uniq
+ exp = experimental[smi].uniq
# https://en.wikipedia.org/wiki/Prediction_interval
min = pred[0]-1.96*pred[1]
max = pred[0]+1.96*pred[1]
diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R
index 74a2739..ef69058 100755
--- a/scripts/test-correlation-plot.R
+++ b/scripts/test-correlation-plot.R
@@ -1,15 +1,17 @@
+#!/usr/bin/Rscript
+
library(ggplot2)
library(grid)
library(gridExtra)
experimental <- read.csv("data/median-correlation.csv",header=T)
-p1 = qplot(-log10(mazzatorta),-log10(swiss),data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+p1 = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
training = read.csv("data/training-test-predictions.csv",header=T)
-p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+p2 = qplot(LOAEL_predicted,LOAEL_measured_median,data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
-pdf('figure/test-correlation.pdf')
+pdf('figures/test-correlation.pdf')
grid.arrange(p1,p2,ncol=1,respect=T)
dev.off()
diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R
index db003d3..7201e1a 100755
--- a/scripts/test-prediction-plot.R
+++ b/scripts/test-prediction-plot.R
@@ -1,7 +1,9 @@
+#!/usr/bin/Rscript
+
library(ggplot2)
training = read.csv("data/training-test-predictions.csv",header=T)
-test <- read.csv("data/test.csv",header=T)
+test <- read.csv("data/test_log10.csv",header=T)
n = c("SMILES","LOAEL","Source")
data = data.frame(factor(test$SMILES),test$LOAEL,factor(test$Dataset))
@@ -11,11 +13,11 @@ comb = data.frame(factor(training$SMILES),training$LOAEL_predicted,factor(traini
names(comb) = n
comb$Type = "predicted"
data = rbind(data,comb)
-data$LOAEL = -log(data$LOAEL)
+#data$LOAEL = -log(data$LOAEL)
data$SMILES <- reorder(data$SMILES,data$LOAEL)
#img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),shape=Source,color=Type))
img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Type))
img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
img <- img + geom_point()
-ggsave(file='figure/test-prediction.pdf', plot=img,width=12, height=8)
+ggsave(file='figures/test-prediction.pdf', plot=img,width=12, height=8)
diff --git a/scripts/test-validation-results.rb b/scripts/test-validation-results.rb
new file mode 100755
index 0000000..2750019
--- /dev/null
+++ b/scripts/test-validation-results.rb
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+validation = Validation::TrainTest.find File.read("data/training-test-predictions.id").chomp
+
+data = []
+puts ["SMILES","LOAEL_measured_median","LOAEL_predicted","Error","Dataset"].join(",")
+validation.predictions.each do |id,p|
+ data << [Compound.find(id).smiles, p["measurements"].median, p["value"], (p["measurements"].median-p["value"]).abs,"test-prediction"]
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+puts data.collect{|r| r.join ","}.join("\n")
diff --git a/scripts/test-validation.rb b/scripts/test-validation.rb
index 0bbcc42..5c07449 100755
--- a/scripts/test-validation.rb
+++ b/scripts/test-validation.rb
@@ -1,26 +1,10 @@
-require_relative "include.rb"
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
-test = Dataset.from_csv_file(File.join(DATA,"test.csv"))
+test = Dataset.from_csv_file(File.join("data","test_log10.csv"))
+train = Dataset.from_csv_file(File.join("data","training_log10.csv"))
-file = File.join(DATA,ARGV[0])
-dataset = Dataset.from_csv_file file
-model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
-#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
-#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average")
-validation = RegressionValidation.create model, dataset, test
-csv_file = file.sub(".csv","-test-predictions.csv")
-id_file = file.sub(".csv","-test-predictions.id")
-File.open(id_file,"w+"){|f| f.puts validation.id}
-name = File.basename(ARGV[0],".csv")
-
-data = []
-validation.predictions.each do |p|
- data << [Compound.find(p[0]).smiles, p[1].median, p[2], p[3],"#{name}-prediction"]
-end
-
-data.sort!{|a,b| a[1] <=> b[1]}
-
-CSV.open(csv_file,"w+") do |csv|
- csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","RMSE","Dataset"]
- data.each{|r| csv << r}
-end
+model = Model::LazarRegression.create(training_dataset: train)
+validation = Validation::TrainTest.create model, train, test
+puts validation.id