summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2017-02-09 14:25:19 +0100
committer Christoph Helma <helma@in-silico.ch> 2017-02-09 14:25:19 +0100
commit db82eef974b8783c40e7daa504feead3f555fdb8 (patch)
tree a9bf02a6455cb2cb399b15bed3fcd759cb2b3448 /scripts
parent 7ad7c10c1e708f6b5a3473de24dbeab03d0b74a3 (diff)
directories restructured
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/correct-predictions.R 1
-rwxr-xr-x scripts/create-median-correlation.rb 28
-rwxr-xr-x scripts/create-test.rb 36
-rwxr-xr-x scripts/create-training.rb 36
-rwxr-xr-x scripts/crossvalidation-plots.R 15
-rwxr-xr-x scripts/crossvalidation.rb 25
-rwxr-xr-x scripts/dataset-variability.R 33
-rwxr-xr-x scripts/functional-groups-smarts.rb 14
-rwxr-xr-x scripts/functional-groups.R 9
-rwxr-xr-x scripts/functional-groups.rb 31
-rwxr-xr-x scripts/functional-groups4R.rb 30
-rwxr-xr-x scripts/include.rb 5
-rwxr-xr-x scripts/mazzatorta-unique-smiles.rb 17
-rwxr-xr-x scripts/misclassification-predictions.rb 15
-rwxr-xr-x scripts/misclassifications.rb 61
-rwxr-xr-x scripts/noael_loael2mmol.rb 19
-rwxr-xr-x scripts/test-correlation-plot.R 15
-rwxr-xr-x scripts/test-prediction-plot.R 21
-rwxr-xr-x scripts/test-prediction.R 15
-rwxr-xr-x scripts/test-validation.rb 26
20 files changed, 452 insertions, 0 deletions
diff --git a/scripts/correct-predictions.R b/scripts/correct-predictions.R
new file mode 100755
index 0000000..f39d4fe
--- /dev/null
+++ b/scripts/correct-predictions.R
@@ -0,0 +1 @@
+correct_predictions <- 134  # NOTE(review): presumably the within-interval count from misclassifications.rb (its writer line is commented out) — confirm
diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb
new file mode 100755
index 0000000..9a2f6f5
--- /dev/null
+++ b/scripts/create-median-correlation.rb
@@ -0,0 +1,28 @@
+require_relative 'include.rb'
+
+old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
+new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+
+common_compound_ids = (old.compound_ids & new.compound_ids).uniq
+
+data = []
+common_compound_ids.each do |cid|
+ c = Compound.find cid
+ old_values = old.values(c,old.features.first)
+ new_values = new.values(c,new.features.first)
+ identical = old_values & new_values
+ unless identical.empty?
+ old_values -= identical
+ new_values -= identical
+ end
+ unless old_values.empty? or new_values.empty?
+ data << [c.smiles,old_values.median,new_values.median]
+ end
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(File.join(DATA,"median-correlation.csv"),"w+") do |csv|
+ csv << ["SMILES","mazzatorta","swiss"]
+ data.each{|r| csv << r}
+end
diff --git a/scripts/create-test.rb b/scripts/create-test.rb
new file mode 100755
index 0000000..782f172
--- /dev/null
+++ b/scripts/create-test.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
+new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+
+common_compound_ids = (old.compound_ids & new.compound_ids).uniq
+
+data = []
+common_compound_ids.each do |cid|
+ c = Compound.find cid
+ old_values = old.values(c,old.features.first)
+ new_values = new.values(c,new.features.first)
+ identical = old_values & new_values
+ unless identical.empty?
+ old_values -= identical
+ new_values -= identical
+ end
+ identical.each do |v|
+ data << [c.smiles,v,"mazzatorta, swiss"]
+ end
+ old_values.each do |v|
+ data << [c.smiles,v,"mazzatorta"]
+ end
+ new_values.each do |v|
+ data << [c.smiles,v,"swiss"]
+ end
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(File.join(DATA,"test.csv"),"w+") do |csv|
+ csv << ["SMILES","LOAEL","Dataset"]
+ data.each{|r| csv << r}
+end
diff --git a/scripts/create-training.rb b/scripts/create-training.rb
new file mode 100755
index 0000000..d05bc1c
--- /dev/null
+++ b/scripts/create-training.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv")
+new = Dataset.from_csv_file File.join(DATA,"swiss.csv")
+
+common_compound_ids = (old.compound_ids + new.compound_ids).uniq
+
+data = []
+common_compound_ids.each do |cid|
+ c = Compound.find cid
+ old_values = old.values(c,old.features.first)
+ new_values = new.values(c,new.features.first)
+ identical = old_values & new_values
+ unless identical.empty?
+ old_values -= identical
+ new_values -= identical
+ end
+ identical.each do |v|
+ data << [c.smiles,v,"mazzatorta, swiss"]
+ end
+ old_values.each do |v|
+ data << [c.smiles,v,"mazzatorta"]
+ end
+ new_values.each do |v|
+ data << [c.smiles,v,"swiss"]
+ end
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(File.join(DATA,"training.csv"),"w+") do |csv|
+ csv << ["SMILES","LOAEL","Dataset"]
+ data.each{|r| csv << r}
+end
diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R
new file mode 100755
index 0000000..2bc259f
--- /dev/null
+++ b/scripts/crossvalidation-plots.R
@@ -0,0 +1,15 @@
+library(ggplot2)
+library(grid)
+library(gridExtra)
+
+# Scatter plot of -log10 predicted vs. -log10 measured-median LOAEL for one crossvalidation CSV.
+cv_plot <- function(csv_path) {
+  cv <- read.csv(csv_path, header=TRUE)
+  qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=cv,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5)
+}
+
+plots <- lapply(sprintf("data/training-cv-%d.csv", 0:2), cv_plot)
+
+pdf('figure/crossvalidation.pdf')
+grid.arrange(plots[[1]], plots[[2]], plots[[3]], ncol=2)
+dev.off()
diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb
new file mode 100755
index 0000000..79aeb83
--- /dev/null
+++ b/scripts/crossvalidation.rb
@@ -0,0 +1,25 @@
+require_relative 'include.rb'
+
+name = File.basename ARGV[0], ".csv"
+file = File.join DATA,ARGV[0]
+dataset = Dataset.from_csv_file file
+model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
+#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average")
+csv_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv"))
+id_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id"))
+cv = RegressionCrossValidation.create model
+File.open(id_file,"w+"){|f| f.puts cv.id}
+
+data = []
+cv.predictions.each do |p|
+ smi = Compound.find(p[0]).smiles
+ data << [smi,p[1].median,p[2],p[3]]
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(csv_file,"w+") do |csv|
+ csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Confidence"]
+ data.each{|r| csv << r}
+end
diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R
new file mode 100755
index 0000000..775fd03
--- /dev/null
+++ b/scripts/dataset-variability.R
@@ -0,0 +1,33 @@
+library(ggplot2)
+library(grid)
+library(gridExtra)
+
+# Rows of a dataset whose SMILES occurs more than once, LOAEL as -log10, SMILES levels ordered by LOAEL.
+replicates <- function(csv_path) {
+  d <- read.csv(csv_path, header=TRUE)
+  d <- d[d$SMILES %in% unique(d$SMILES[duplicated(d$SMILES)]),]
+  d$LOAEL <- -log10(d$LOAEL)
+  d$SMILES <- reorder(d$SMILES, d$LOAEL)
+  d
+}
+m.dup <- replicates("data/mazzatorta.csv")
+s.dup <- replicates("data/swiss.csv")
+
+# p1/p2 are only consumed by the disabled pdf/grid.arrange lines below.
+p1 <- ggplot(m.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Mazzatorta") + ylim(-1,4)
+p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Swiss Federal Office") + ylim(-1,4)
+
+#pdf('figure/dataset-variability.pdf')
+#grid.arrange(p1,p2,ncol=1)
+#dev.off()
+
+# Combined figure from data/test.csv, points coloured by Dataset.
+data <- read.csv("data/test.csv", header=TRUE)
+data$LOAEL <- -log(data$LOAEL)  # NOTE(review): natural log, while the replicate frames above use -log10 — confirm
+data$SMILES <- reorder(data$SMILES, data$LOAEL)
+img <- ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) +
+  geom_point() + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') +
+  theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) +
+  scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office"))
+
+ggsave(file='figure/dataset-variability.pdf', plot=img, width=12,height=8)
diff --git a/scripts/functional-groups-smarts.rb b/scripts/functional-groups-smarts.rb
new file mode 100755
index 0000000..0fd0520
--- /dev/null
+++ b/scripts/functional-groups-smarts.rb
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'csv'
+
+smarts = {}
+File.open("../functional-groups.txt").each_line do |row|
+ name,sma = row.chomp.split ": "
+ smarts[name] = "'#{sma}'"
+end
+
+names = []
+CSV.foreach("../data/functional-groups-reduced4R.csv") do |row|
+ names << row[0].gsub(" ", "_")
+end
+names.uniq.each{|name| puts [name,smarts[name]].join ","}
diff --git a/scripts/functional-groups.R b/scripts/functional-groups.R
new file mode 100755
index 0000000..01f2043
--- /dev/null
+++ b/scripts/functional-groups.R
@@ -0,0 +1,9 @@
+library("ggplot2")
+# Dodged, flipped bar chart of the counts in functional-groups-reduced4R.csv, filled by Dataset.
+groups <- read.csv("data/functional-groups-reduced4R.csv", header=FALSE)
+colnames(groups) <- c("V1","V2","Dataset")
+# Order the V1 levels by decreasing V2.
+groups$V1 <- reorder(groups$V1, -groups$V2)
+
+ggplot(groups,aes(x=V1,y=V2,fill=Dataset)) + geom_bar(stat="identity", position=position_dodge()) + xlab("") + ylab("") + coord_flip()
+ggsave("figure/functional-groups.pdf")
diff --git a/scripts/functional-groups.rb b/scripts/functional-groups.rb
new file mode 100755
index 0000000..9da2a18
--- /dev/null
+++ b/scripts/functional-groups.rb
@@ -0,0 +1,31 @@
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+old = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","LOAEL_mg_corrected_smiles_mmol.csv")
+new = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","swissRat_chron_LOAEL_mmol.csv")
+
+functional_groups = {}
+#functional_groups[:old] = {}
+#functional_groups[:new] = {}
+table = []
+#File.open("functional-groups.csv","w+") do |file|
+ File.open("functional-groups.txt").each_line do |line|
+ name, smarts = line.chomp.split ": "
+ if smarts
+ smarts_feature = Smarts.from_smarts smarts
+ oldcount = 0
+ old.compounds.each do |c|
+ oldcount += Algorithm::Descriptor.smarts_match(c,smarts_feature).first.to_i
+ end
+ newcount = 0
+ new.compounds.each do |c|
+ newcount += Algorithm::Descriptor.smarts_match(c,smarts_feature).first.to_i
+ end
+ puts "#{name}, #{oldcount}, #{newcount}" if oldcount > 0 and newcount > 0
+ else
+ p name, smarts
+ end
+ #table << [name, oldcount, newcount]
+ end
+#end
+#print table.to_csv
+#old_fp = old.compounds.collect{|c| c.fingerprint("FP4")}
diff --git a/scripts/functional-groups4R.rb b/scripts/functional-groups4R.rb
new file mode 100755
index 0000000..0b14b7a
--- /dev/null
+++ b/scripts/functional-groups4R.rb
@@ -0,0 +1,30 @@
+require 'csv'
+csv = []
+exclude = [
+ %{Acetal},
+ "Anion",
+ %r{_bond},
+ %{_carbon},
+ "Charged",
+ %{Hetero_},
+ %{_rings},
+ "Kation",
+ %{NOS},
+ "Salt",
+ "Spiro",
+ %{Sugar}
+]
+CSV.foreach("data/functional-groups.csv") do |row|
+ keep = true
+ exclude.each do |patt|
+ keep = false if row[0].match(patt)
+ end
+ if keep and [row[1].to_i,row[2].to_i].max >= 25
+ csv << [row[0].gsub('_',' '),row[1].to_i,"Mazzatorta"]
+ csv << [row[0].gsub('_',' '),row[2].to_i,"Swiss Federal Office"]
+ else
+ p row
+ end
+end
+
+File.open("data/functional-groups-reduced4R.csv","w+"){|f| f.puts csv.collect{|r| r.join ", "}.join("\n")}
diff --git a/scripts/include.rb b/scripts/include.rb
new file mode 100755
index 0000000..edc3a64
--- /dev/null
+++ b/scripts/include.rb
@@ -0,0 +1,5 @@
+require_relative '../lazar/lib/lazar' # NOTE(review): sibling scripts in this directory load '../../lazar/lib/lazar' — confirm which relative path is right
+include OpenTox # mix the OpenTox module into the top level
+DATA = File.join(File.dirname(__FILE__),"data") # "data" directory next to this file
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs
diff --git a/scripts/mazzatorta-unique-smiles.rb b/scripts/mazzatorta-unique-smiles.rb
new file mode 100755
index 0000000..0b6db2a
--- /dev/null
+++ b/scripts/mazzatorta-unique-smiles.rb
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = []
+data = []
+csv_in.each do |line|
+ c = Compound.from_smiles line[0]
+ # round to 5 significant digits in order to detect duplicates
+ mmol = line[1].to_f.signif(5)
+ data << [c.smiles,mmol,"mazzatorta"] #if c
+end
+data.sort!{|a,b| a[1] <=> b[1]}
+puts ["SMILES","LOAEL","Dataset"].join ","
+puts data.collect{|row| row.join ","}.join "\n"
diff --git a/scripts/misclassification-predictions.rb b/scripts/misclassification-predictions.rb
new file mode 100755
index 0000000..8d8c837
--- /dev/null
+++ b/scripts/misclassification-predictions.rb
@@ -0,0 +1,15 @@
+require_relative "include.rb"
+dataset = Dataset.from_csv_file "data/training.csv"
+compounds = CSV.read("data/misclassifications.csv")[1..2].collect{|m| Compound.from_smiles(m[0])}
+model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
+#predictions = compounds.collect{|c| model.predict c}
+#predictions.each do |p|
+#end
+#p compounds[1].smiles
+p compounds[1].names
+prediction = model.predict compounds[1]
+#prediction[:neighbors] = prediction[:neighbors].collect{|n| n.delete(:dataset_ids)}
+prediction[:neighbors].each{|n| n.delete(:dataset_ids)}
+#prediction[:neighbors] = prediction[:neighbors].collect{|n| n[:tanimoto]}
+prediction[:neighbors] = prediction[:neighbors].collect{|n| Compound.find(n["_id"]).smiles}
+puts JSON.pretty_generate(prediction)
diff --git a/scripts/misclassifications.rb b/scripts/misclassifications.rb
new file mode 100755
index 0000000..171077c
--- /dev/null
+++ b/scripts/misclassifications.rb
@@ -0,0 +1,61 @@
+require_relative 'include.rb'
+
+class Range
+ def intersection(other)
+ return nil if (self.max < other.begin or other.max < self.begin)
+ [self.begin, other.begin].max..[self.max, other.max].min
+ end
+ alias_method :&, :intersection
+end
+
+experimental = {}
+CSV.foreach(File.join(DATA,"test.csv")) do |row|
+ experimental[row[0]] ||= []
+ experimental[row[0]] << row[1].to_f
+end
+
+predictions = {}
+CSV.foreach(File.join(DATA,"training-test-predictions.csv"),:headers => true) do |row|
+ predictions[row[0]] = [-Math.log10(row[2].to_f),Math.log10(row[3].to_f).abs]
+end
+
+outside_experimental_values = 0
+within_experimental_values = 0
+out = []
+predictions.each do |smi,pred|
+ exp = experimental[smi].collect{|e| -Math.log10(e)}.uniq
+ # https://en.wikipedia.org/wiki/Prediction_interval
+ min = pred[0]-1.96*pred[1]
+ max = pred[0]+1.96*pred[1]
+ pred = predictions[smi][0]
+ ci = predictions[smi][1]
+ err = nil
+ if (min..max) & (exp.min..exp.max)
+ within_experimental_values += 1
+ else
+ outside_experimental_values += 1
+ if exp.min < min
+ err = exp.min - min
+ elsif exp.max > max
+ err = exp.max - max
+ end
+ end
+ if err
+ out << {
+ :smi => smi,
+ :experimental => exp,
+ :min => min,
+ :max => max,
+ :prediction => predictions[smi][0],
+ :ci => predictions[smi][1],
+ :error => err
+ }
+ end
+end
+
+
+out.sort!{|a,b| b[:error].abs <=> a[:error].abs}
+csv = [["SMILES","Distance"]] + out.collect{|o| [o[:smi], o[:error]]}
+File.open("data/misclassifications.csv","w+"){|f| f.puts csv.collect{|r| r.join ", "}.join("\n")}
+
+#File.open("correct-predictions.R","w+"){|f| f.puts "correct_predictions = #{within_experimental_values}"}
diff --git a/scripts/noael_loael2mmol.rb b/scripts/noael_loael2mmol.rb
new file mode 100755
index 0000000..3d79aae
--- /dev/null
+++ b/scripts/noael_loael2mmol.rb
@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = []
+csv_in.each do |line|
+ smi = line[11]
+ mg = line[19]
+ unless mg.to_f == 0.0
+ c = Compound.from_smiles smi
+ # round to 5 significant digits in order to detect duplicates
+ mmol = c.mg_to_mmol(mg.to_f).signif(5)
+ data << [c.smiles, mmol,"swiss"]
+ end
+end
+data.sort!{|a,b| a[1] <=> b[1]}
+puts ["SMILES","LOAEL","Dataset"].join ","
+puts data.collect{|row| row.join ","}.join "\n"
diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R
new file mode 100755
index 0000000..74a2739
--- /dev/null
+++ b/scripts/test-correlation-plot.R
@@ -0,0 +1,15 @@
+library(ggplot2)
+library(grid)
+library(gridExtra)
+
+# Layers shared by both panels: points, geom_abline with intercept 0, and -1..4 limits on both axes.
+decorate <- function(p) p + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+
+experimental <- read.csv("data/median-correlation.csv", header=TRUE)
+top <- decorate(qplot(-log10(mazzatorta),-log10(swiss),data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data"))
+training <- read.csv("data/training-test-predictions.csv", header=TRUE)
+bottom <- decorate(qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined"))
+pdf('figure/test-correlation.pdf')
+grid.arrange(top, bottom, ncol=1, respect=TRUE)
+dev.off()
+
diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R
new file mode 100755
index 0000000..db003d3
--- /dev/null
+++ b/scripts/test-prediction-plot.R
@@ -0,0 +1,21 @@
+library(ggplot2)
+
+# Long-format frame builder: SMILES and Source as factors, Type as a plain character label.
+long_frame <- function(smiles, loael, source, type) {
+  data.frame(SMILES=factor(smiles), LOAEL=loael, Source=factor(source), Type=type, stringsAsFactors=FALSE)
+}
+
+training <- read.csv("data/training-test-predictions.csv", header=TRUE)
+test <- read.csv("data/test.csv", header=TRUE)
+data <- rbind(
+  long_frame(test$SMILES, test$LOAEL, test$Dataset, "experimental"),
+  long_frame(training$SMILES, training$LOAEL_predicted, training$Dataset, "predicted")
+)
+# NOTE(review): natural logarithm here, whereas the crossvalidation and correlation plots use -log10 — confirm.
+data$LOAEL <- -log(data$LOAEL)
+data$SMILES <- reorder(data$SMILES, data$LOAEL)
+
+img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Type)) +
+  ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) +
+  geom_point()
+ggsave(file='figure/test-prediction.pdf', plot=img,width=12, height=8)
diff --git a/scripts/test-prediction.R b/scripts/test-prediction.R
new file mode 100755
index 0000000..9b1ea76
--- /dev/null
+++ b/scripts/test-prediction.R
@@ -0,0 +1,15 @@
+training = read.csv("data/training-test-predictions.csv",header=T)
+test <- read.csv("data/test.csv",header=T)
+n = c("SMILES","LOAEL","Source")
+# Print the experimental rows of the compounds that also have a prediction.
+data = data.frame(factor(test$SMILES),test$LOAEL,factor(test$Dataset))
+names(data) = n
+data$Type = "experimental"
+comb = data.frame(factor(training$SMILES),training$LOAEL_predicted,factor(training$Dataset))
+names(comb) = n
+comb$Type = "predicted"
+print(data[data$SMILES %in% comb$SMILES,]) # was data[comb$SMILES,]: a factor index selects rows by integer code, not by SMILES
+#data = rbind(data,comb)
+#data$LOAEL = -log(data$LOAEL)
+#data$SMILES <- reorder(data$SMILES,data$LOAEL)
+#print(data)
diff --git a/scripts/test-validation.rb b/scripts/test-validation.rb
new file mode 100755
index 0000000..0bbcc42
--- /dev/null
+++ b/scripts/test-validation.rb
@@ -0,0 +1,26 @@
+require_relative "include.rb"
+
+test = Dataset.from_csv_file(File.join(DATA,"test.csv"))
+
+file = File.join(DATA,ARGV[0])
+dataset = Dataset.from_csv_file file
+model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
+#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average")
+validation = RegressionValidation.create model, dataset, test
+csv_file = file.sub(".csv","-test-predictions.csv")
+id_file = file.sub(".csv","-test-predictions.id")
+File.open(id_file,"w+"){|f| f.puts validation.id}
+name = File.basename(ARGV[0],".csv")
+
+data = []
+validation.predictions.each do |p|
+ data << [Compound.find(p[0]).smiles, p[1].median, p[2], p[3],"#{name}-prediction"]
+end
+
+data.sort!{|a,b| a[1] <=> b[1]}
+
+CSV.open(csv_file,"w+") do |csv|
+ csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","RMSE","Dataset"]
+ data.each{|r| csv << r}
+end