diff options
author | Christoph Helma <helma@in-silico.ch> | 2017-02-09 14:25:19 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2017-02-09 14:25:19 +0100 |
commit | db82eef974b8783c40e7daa504feead3f555fdb8 (patch) | |
tree | a9bf02a6455cb2cb399b15bed3fcd759cb2b3448 /scripts | |
parent | 7ad7c10c1e708f6b5a3473de24dbeab03d0b74a3 (diff) |
directories restructured
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/correct-predictions.R | 1 | ||||
-rwxr-xr-x | scripts/create-median-correlation.rb | 28 | ||||
-rwxr-xr-x | scripts/create-test.rb | 36 | ||||
-rwxr-xr-x | scripts/create-training.rb | 36 | ||||
-rwxr-xr-x | scripts/crossvalidation-plots.R | 15 | ||||
-rwxr-xr-x | scripts/crossvalidation.rb | 25 | ||||
-rwxr-xr-x | scripts/dataset-variability.R | 33 | ||||
-rwxr-xr-x | scripts/functional-groups-smarts.rb | 14 | ||||
-rwxr-xr-x | scripts/functional-groups.R | 9 | ||||
-rwxr-xr-x | scripts/functional-groups.rb | 31 | ||||
-rwxr-xr-x | scripts/functional-groups4R.rb | 30 | ||||
-rwxr-xr-x | scripts/include.rb | 5 | ||||
-rwxr-xr-x | scripts/mazzatorta-unique-smiles.rb | 17 | ||||
-rwxr-xr-x | scripts/misclassification-predictions.rb | 15 | ||||
-rwxr-xr-x | scripts/misclassifications.rb | 61 | ||||
-rwxr-xr-x | scripts/noael_loael2mmol.rb | 19 | ||||
-rwxr-xr-x | scripts/test-correlation-plot.R | 15 | ||||
-rwxr-xr-x | scripts/test-prediction-plot.R | 21 | ||||
-rwxr-xr-x | scripts/test-prediction.R | 15 | ||||
-rwxr-xr-x | scripts/test-validation.rb | 26 |
20 files changed, 452 insertions, 0 deletions
diff --git a/scripts/correct-predictions.R b/scripts/correct-predictions.R new file mode 100755 index 0000000..f39d4fe --- /dev/null +++ b/scripts/correct-predictions.R @@ -0,0 +1 @@ +correct_predictions = 134 diff --git a/scripts/create-median-correlation.rb b/scripts/create-median-correlation.rb new file mode 100755 index 0000000..9a2f6f5 --- /dev/null +++ b/scripts/create-median-correlation.rb @@ -0,0 +1,28 @@ +require_relative 'include.rb' + +old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") +new = Dataset.from_csv_file File.join(DATA,"swiss.csv") + +common_compound_ids = (old.compound_ids & new.compound_ids).uniq + +data = [] +common_compound_ids.each do |cid| + c = Compound.find cid + old_values = old.values(c,old.features.first) + new_values = new.values(c,new.features.first) + identical = old_values & new_values + unless identical.empty? + old_values -= identical + new_values -= identical + end + unless old_values.empty? or new_values.empty? + data << [c.smiles,old_values.median,new_values.median] + end +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(File.join(DATA,"median-correlation.csv"),"w+") do |csv| + csv << ["SMILES","mazzatorta","swiss"] + data.each{|r| csv << r} +end diff --git a/scripts/create-test.rb b/scripts/create-test.rb new file mode 100755 index 0000000..782f172 --- /dev/null +++ b/scripts/create-test.rb @@ -0,0 +1,36 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox + +old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") +new = Dataset.from_csv_file File.join(DATA,"swiss.csv") + +common_compound_ids = (old.compound_ids & new.compound_ids).uniq + +data = [] +common_compound_ids.each do |cid| + c = Compound.find cid + old_values = old.values(c,old.features.first) + new_values = new.values(c,new.features.first) + identical = old_values & new_values + unless identical.empty? + old_values -= identical + new_values -= identical + end + identical.each do |v| + data << [c.smiles,v,"mazzatorta, swiss"] + end + old_values.each do |v| + data << [c.smiles,v,"mazzatorta"] + end + new_values.each do |v| + data << [c.smiles,v,"swiss"] + end +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(File.join(DATA,"test.csv"),"w+") do |csv| + csv << ["SMILES","LOAEL","Dataset"] + data.each{|r| csv << r} +end diff --git a/scripts/create-training.rb b/scripts/create-training.rb new file mode 100755 index 0000000..d05bc1c --- /dev/null +++ b/scripts/create-training.rb @@ -0,0 +1,36 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox + +old = Dataset.from_csv_file File.join(DATA,"mazzatorta.csv") +new = Dataset.from_csv_file File.join(DATA,"swiss.csv") + +common_compound_ids = (old.compound_ids + new.compound_ids).uniq + +data = [] +common_compound_ids.each do |cid| + c = Compound.find cid + old_values = old.values(c,old.features.first) + new_values = new.values(c,new.features.first) + identical = old_values & new_values + unless identical.empty? + old_values -= identical + new_values -= identical + end + identical.each do |v| + data << [c.smiles,v,"mazzatorta, swiss"] + end + old_values.each do |v| + data << [c.smiles,v,"mazzatorta"] + end + new_values.each do |v| + data << [c.smiles,v,"swiss"] + end +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(File.join(DATA,"training.csv"),"w+") do |csv| + csv << ["SMILES","LOAEL","Dataset"] + data.each{|r| csv << r} +end diff --git a/scripts/crossvalidation-plots.R b/scripts/crossvalidation-plots.R new file mode 100755 index 0000000..2bc259f --- /dev/null +++ b/scripts/crossvalidation-plots.R @@ -0,0 +1,15 @@ +library(ggplot2) +library(grid) +library(gridExtra) + +t0 = read.csv("data/training-cv-0.csv",header=T) +t1 = read.csv("data/training-cv-1.csv",header=T) +t2 = read.csv("data/training-cv-2.csv",header=T) + +p0 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t0,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) +p1 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t1,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) +p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=t2,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-2,4.5) + ylim(-2,4.5) + +pdf('figure/crossvalidation.pdf') +grid.arrange(p0,p1,p2,ncol=2) +dev.off() diff --git a/scripts/crossvalidation.rb b/scripts/crossvalidation.rb new file mode 100755 index 0000000..79aeb83 --- /dev/null +++ b/scripts/crossvalidation.rb @@ -0,0 +1,25 @@ +require_relative 'include.rb' + +name = File.basename ARGV[0], ".csv" +file = File.join DATA,ARGV[0] +dataset = Dataset.from_csv_file file +model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") +#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") +#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average") +csv_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.csv")) +id_file = File.join(DATA,ARGV[0].sub(/.csv/,"-cv-#{ARGV[1]}.id")) +cv = RegressionCrossValidation.create model +File.open(id_file,"w+"){|f| f.puts cv.id} + +data = [] +cv.predictions.each do |p| + smi = Compound.find(p[0]).smiles + data << [smi,p[1].median,p[2],p[3]] +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(csv_file,"w+") do |csv| + csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Confidence"] + data.each{|r| csv << r} +end diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R new file mode 100755 index 0000000..775fd03 --- /dev/null +++ b/scripts/dataset-variability.R @@ -0,0 +1,33 @@ +library(ggplot2) +library(grid) +library(gridExtra) + +m = read.csv("data/mazzatorta.csv",header=T) +s = read.csv("data/swiss.csv",header=T) + +m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)]) +s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)]) +m.dup = m[m$SMILES %in% m.dupsmi,] +s.dup = s[s$SMILES %in% s.dupsmi,] + +m.dup$LOAEL= -log10(m.dup$LOAEL) +s.dup$LOAEL= -log10(s.dup$LOAEL) +m.dup$SMILES <- reorder(m.dup$SMILES,m.dup$LOAEL) +s.dup$SMILES <- reorder(s.dup$SMILES,s.dup$LOAEL) + +p1 <- ggplot(m.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Mazzatorta") + ylim(-1,4) +p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Swiss Federal Office") + ylim(-1,4) + +#pdf('figure/dataset-variability.pdf') +#grid.arrange(p1,p2,ncol=1) +#dev.off() + +data <- read.csv("data/test.csv",header=T) +data$LOAEL = -log(data$LOAEL) +data$SMILES <- reorder(data$SMILES,data$LOAEL) +img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point() +img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) +img = img + scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office")) +img = img + +ggsave(file='figure/dataset-variability.pdf', plot=img, width=12,height=8) diff --git a/scripts/functional-groups-smarts.rb b/scripts/functional-groups-smarts.rb new file mode 100755 index 0000000..0fd0520 --- /dev/null +++ b/scripts/functional-groups-smarts.rb @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby +require 'csv' + +smarts = {} +File.open("../functional-groups.txt").each_line do |row| + name,sma = row.chomp.split ": " + smarts[name] = "'#{sma}'" +end + +names = [] +CSV.foreach("../data/functional-groups-reduced4R.csv") do |row| + names << row[0].gsub(" ", "_") +end +names.uniq.each{|name| puts [name,smarts[name]].join ","} diff --git a/scripts/functional-groups.R b/scripts/functional-groups.R new file mode 100755 index 0000000..01f2043 --- /dev/null +++ b/scripts/functional-groups.R @@ -0,0 +1,9 @@ +library("ggplot2") + +data <- read.csv("data/functional-groups-reduced4R.csv",header=F) + +names(data) = c("V1","V2","Dataset") +data$V1 <- reorder(data$V1,-data$V2) + +ggplot(data,aes(x=V1,y=V2,fill=Dataset)) + geom_bar(stat="identity", position=position_dodge()) + xlab("") + ylab("") + coord_flip() +ggsave("figure/functional-groups.pdf") diff --git a/scripts/functional-groups.rb b/scripts/functional-groups.rb new file mode 100755 index 0000000..9da2a18 --- /dev/null +++ b/scripts/functional-groups.rb @@ -0,0 +1,31 @@ +require_relative '../../lazar/lib/lazar' +include OpenTox +old = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","LOAEL_mg_corrected_smiles_mmol.csv") +new = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","swissRat_chron_LOAEL_mmol.csv") + +functional_groups = {} +#functional_groups[:old] = {} +#functional_groups[:new] = {} +table = [] +#File.open("functional-groups.csv","w+") do |file| + File.open("functional-groups.txt").each_line do |line| + name, smarts = line.chomp.split ": " + if smarts + smarts_feature = Smarts.from_smarts smarts + oldcount = 0 + old.compounds.each do |c| + oldcount += Algorithm::Descriptor.smarts_match(c,smarts_feature).first.to_i + end + newcount = 0 + new.compounds.each do |c| + newcount += Algorithm::Descriptor.smarts_match(c,smarts_feature).first.to_i + end + puts "#{name}, #{oldcount}, #{newcount}" if oldcount > 0 and newcount > 0 + else + p name, smarts + end + #table << [name, oldcount, newcount] + end +#end +#print table.to_csv +#old_fp = old.compounds.collect{|c| c.fingerprint("FP4")} diff --git a/scripts/functional-groups4R.rb b/scripts/functional-groups4R.rb new file mode 100755 index 0000000..0b14b7a --- /dev/null +++ b/scripts/functional-groups4R.rb @@ -0,0 +1,30 @@ +require 'csv' +csv = [] +exclude = [ + %{Acetal}, + "Anion", + %r{_bond}, + %{_carbon}, + "Charged", + %{Hetero_}, + %{_rings}, + "Kation", + %{NOS}, + "Salt", + "Spiro", + %{Sugar} +] +CSV.foreach("data/functional-groups.csv") do |row| + keep = true + exclude.each do |patt| + keep = false if row[0].match(patt) + end + if keep and [row[1].to_i,row[2].to_i].max >= 25 + csv << [row[0].gsub('_',' '),row[1].to_i,"Mazzatorta"] + csv << [row[0].gsub('_',' '),row[2].to_i,"Swiss Federal Office"] + else + p row + end +end + +File.open("data/functional-groups-reduced4R.csv","w+"){|f| f.puts csv.collect{|r| r.join ", "}.join("\n")} diff --git a/scripts/include.rb b/scripts/include.rb new file mode 100755 index 0000000..edc3a64 --- /dev/null +++ b/scripts/include.rb @@ -0,0 +1,5 @@ +require_relative '../lazar/lib/lazar' +include OpenTox +DATA = File.join(File.dirname(__FILE__),"data") +#$mongo.database.drop +#$gridfs = $mongo.database.fs diff --git a/scripts/mazzatorta-unique-smiles.rb b/scripts/mazzatorta-unique-smiles.rb new file mode 100755 index 0000000..0b6db2a --- /dev/null +++ b/scripts/mazzatorta-unique-smiles.rb @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox + +csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8') +head = csv_in.shift +data = [] +data = [] +csv_in.each do |line| + c = Compound.from_smiles line[0] + # round to 5 significant digits in order to detect duplicates + mmol = line[1].to_f.signif(5) + data << [c.smiles,mmol,"mazzatorta"] #if c +end +data.sort!{|a,b| a[1] <=> b[1]} +puts ["SMILES","LOAEL","Dataset"].join "," +puts data.collect{|row| row.join ","}.join "\n" diff --git a/scripts/misclassification-predictions.rb b/scripts/misclassification-predictions.rb new file mode 100755 index 0000000..8d8c837 --- /dev/null +++ b/scripts/misclassification-predictions.rb @@ -0,0 +1,15 @@ +require_relative "include.rb" +dataset = Dataset.from_csv_file "data/training.csv" +compounds = CSV.read("data/misclassifications.csv")[1..2].collect{|m| Compound.from_smiles(m[0])} +model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") +#predictions = compounds.collect{|c| model.predict c} +#predictions.each do |p| +#end +#p compounds[1].smiles +p compounds[1].names +prediction = model.predict compounds[1] +#prediction[:neighbors] = prediction[:neighbors].collect{|n| n.delete(:dataset_ids)} +prediction[:neighbors].each{|n| n.delete(:dataset_ids)} +#prediction[:neighbors] = prediction[:neighbors].collect{|n| n[:tanimoto]} +prediction[:neighbors] = prediction[:neighbors].collect{|n| Compound.find(n["_id"]).smiles} +puts JSON.pretty_generate(prediction) diff --git a/scripts/misclassifications.rb b/scripts/misclassifications.rb new file mode 100755 index 0000000..171077c --- /dev/null +++ b/scripts/misclassifications.rb @@ -0,0 +1,61 @@ +require_relative 'include.rb' + +class Range + def intersection(other) + return nil if (self.max < other.begin or other.max < self.begin) + [self.begin, other.begin].max..[self.max, other.max].min + end + alias_method :&, :intersection +end + +experimental = {} +CSV.foreach(File.join(DATA,"test.csv")) do |row| + experimental[row[0]] ||= [] + experimental[row[0]] << row[1].to_f +end + +predictions = {} +CSV.foreach(File.join(DATA,"training-test-predictions.csv"),:headers => true) do |row| + predictions[row[0]] = [-Math.log10(row[2].to_f),Math.log10(row[3].to_f).abs] +end + +outside_experimental_values = 0 +within_experimental_values = 0 +out = [] +predictions.each do |smi,pred| + exp = experimental[smi].collect{|e| -Math.log10(e)}.uniq + # https://en.wikipedia.org/wiki/Prediction_interval + min = pred[0]-1.96*pred[1] + max = pred[0]+1.96*pred[1] + pred = predictions[smi][0] + ci = predictions[smi][1] + err = nil + if (min..max) & (exp.min..exp.max) + within_experimental_values += 1 + else + outside_experimental_values += 1 + if exp.min < min + err = exp.min - min + elsif exp.max > max + err = exp.max - max + end + end + if err + out << { + :smi => smi, + :experimental => exp, + :min => min, + :max => max, + :prediction => predictions[smi][0], + :ci => predictions[smi][1], + :error => err + } + end +end + + +out.sort!{|a,b| b[:error].abs <=> a[:error].abs} +csv = [["SMILES","Distance"]] + out.collect{|o| [o[:smi], o[:error]]} +File.open("data/misclassifications.csv","w+"){|f| f.puts csv.collect{|r| r.join ", "}.join("\n")} + +#File.open("correct-predictions.R","w+"){|f| f.puts "correct_predictions = #{within_experimental_values}"} diff --git a/scripts/noael_loael2mmol.rb b/scripts/noael_loael2mmol.rb new file mode 100755 index 0000000..3d79aae --- /dev/null +++ b/scripts/noael_loael2mmol.rb @@ -0,0 +1,19 @@ +#!/usr/bin/env ruby +require_relative '../../lazar/lib/lazar' +include OpenTox +csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8') +head = csv_in.shift +data = [] +csv_in.each do |line| + smi = line[11] + mg = line[19] + unless mg.to_f == 0.0 + c = Compound.from_smiles smi + # round to 5 significant digits in order to detect duplicates + mmol = c.mg_to_mmol(mg.to_f).signif(5) + data << [c.smiles, mmol,"swiss"] + end +end +data.sort!{|a,b| a[1] <=> b[1]} +puts ["SMILES","LOAEL","Dataset"].join "," +puts data.collect{|row| row.join ","}.join "\n" diff --git a/scripts/test-correlation-plot.R b/scripts/test-correlation-plot.R new file mode 100755 index 0000000..74a2739 --- /dev/null +++ b/scripts/test-correlation-plot.R @@ -0,0 +1,15 @@ +library(ggplot2) +library(grid) +library(gridExtra) + +experimental <- read.csv("data/median-correlation.csv",header=T) +p1 = qplot(-log10(mazzatorta),-log10(swiss),data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)",main="Experimental data") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) + +training = read.csv("data/training-test-predictions.csv",header=T) + +p2 = qplot(-log10(LOAEL_predicted),-log10(LOAEL_measured_median),data=training,xlab="-log10(LOAEL predicted)",ylab="-log10(LOAEL measured median)",main="Combined") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4) + +pdf('figure/test-correlation.pdf') +grid.arrange(p1,p2,ncol=1,respect=T) +dev.off() + diff --git a/scripts/test-prediction-plot.R b/scripts/test-prediction-plot.R new file mode 100755 index 0000000..db003d3 --- /dev/null +++ b/scripts/test-prediction-plot.R @@ -0,0 +1,21 @@ +library(ggplot2) + +training = read.csv("data/training-test-predictions.csv",header=T) +test <- read.csv("data/test.csv",header=T) +n = c("SMILES","LOAEL","Source") + +data = data.frame(factor(test$SMILES),test$LOAEL,factor(test$Dataset)) +names(data) = n +data$Type = "experimental" +comb = data.frame(factor(training$SMILES),training$LOAEL_predicted,factor(training$Dataset)) +names(comb) = n +comb$Type = "predicted" +data = rbind(data,comb) +data$LOAEL = -log(data$LOAEL) +data$SMILES <- reorder(data$SMILES,data$LOAEL) +#img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),shape=Source,color=Type)) +img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Type)) +img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank()) +img <- img + geom_point() + +ggsave(file='figure/test-prediction.pdf', plot=img,width=12, height=8) diff --git a/scripts/test-prediction.R b/scripts/test-prediction.R new file mode 100755 index 0000000..9b1ea76 --- /dev/null +++ b/scripts/test-prediction.R @@ -0,0 +1,15 @@ +training = read.csv("data/training-test-predictions.csv",header=T) +test <- read.csv("data/test.csv",header=T) +n = c("SMILES","LOAEL","Source") + +data = data.frame(factor(test$SMILES),test$LOAEL,factor(test$Dataset)) +names(data) = n +data$Type = "experimental" +comb = data.frame(factor(training$SMILES),training$LOAEL_predicted,factor(training$Dataset)) +names(comb) = n +comb$Type = "predicted" +print(data[comb$SMILES,]) +#data = rbind(data,comb) +#data$LOAEL = -log(data$LOAEL) +#data$SMILES <- reorder(data$SMILES,data$LOAEL) +#print(data) diff --git a/scripts/test-validation.rb b/scripts/test-validation.rb new file mode 100755 index 0000000..0bbcc42 --- /dev/null +++ b/scripts/test-validation.rb @@ -0,0 +1,26 @@ +require_relative "include.rb" + +test = Dataset.from_csv_file(File.join(DATA,"test.csv")) + +file = File.join(DATA,ARGV[0]) +dataset = Dataset.from_csv_file file +model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") +#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") +#model = Model::LazarRegression.create(dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average") +validation = RegressionValidation.create model, dataset, test +csv_file = file.sub(".csv","-test-predictions.csv") +id_file = file.sub(".csv","-test-predictions.id") +File.open(id_file,"w+"){|f| f.puts validation.id} +name = File.basename(ARGV[0],".csv") + +data = [] +validation.predictions.each do |p| + data << [Compound.find(p[0]).smiles, p[1].median, p[2], p[3],"#{name}-prediction"] +end + +data.sort!{|a,b| a[1] <=> b[1]} + +CSV.open(csv_file,"w+") do |csv| + csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","RMSE","Dataset"] + data.each{|r| csv << r} +end |