-rw-r--r--  README.md                               2
-rwxr-xr-x  lib/ot_predictions.rb                  21
-rw-r--r--  lib/prediction_data.rb                  4
-rwxr-xr-x  lib/predictions.rb                     68
-rwxr-xr-x  lib/validation_db.rb                   20
-rw-r--r--  report/plot_factory.rb                  5
-rwxr-xr-x  report/report_content.rb               11
-rwxr-xr-x  report/report_factory.rb               18
-rwxr-xr-x  report/report_persistance.rb            1
-rwxr-xr-x  report/validation_access.rb            39
-rwxr-xr-x  test/test_examples.rb                  18
-rwxr-xr-x  test/test_examples_util.rb             80
-rwxr-xr-x  validation/validation_application.rb   47
-rwxr-xr-x  validation/validation_service.rb      250
-rwxr-xr-x  validation/validation_test.rb         104
15 files changed, 479 insertions, 209 deletions
diff --git a/README.md b/README.md
index 4bbec5d..7005b22 100644
--- a/README.md
+++ b/README.md
@@ -6,4 +6,4 @@ OpenTox Validation
[API documentation](http://rdoc.info/github/opentox/validation)
--------------------------------------------------------------
-Copyright (c) 2009-2011 Martin Guetlein, Christoph Helma. See LICENSE for details.
+Copyright (c) 2009-2012 Martin Guetlein, Christoph Helma. See LICENSE for details.
diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb
index 3be845b..2752fcc 100755
--- a/lib/ot_predictions.rb
+++ b/lib/ot_predictions.rb
@@ -35,7 +35,7 @@ module Lib
OTPredictions.to_array( [self] )
end
- def self.to_array( predictions, add_pic=false, format=false )
+ def self.to_array( predictions, add_pic=false, format=false, validation_uris=nil )
confidence_available = false
predictions.each do |p|
@@ -43,7 +43,10 @@ module Lib
end
res = []
conf_column = nil
+ count = 0
predictions.each do |p|
+ v_uris = validation_uris[count] if validation_uris
+ count += 1
(0..p.num_instances-1).each do |i|
a = []
@@ -75,6 +78,9 @@ module Lib
conf_column = a.size if conf_column==nil
a << p.confidence_value(i)
end
+ if validation_uris
+ a << v_uris[i]
+ end
a << p.identifier(i)
res << a
end
@@ -90,12 +96,13 @@ module Lib
end
end
header = []
- header << "compound" if add_pic
- header << "actual value"
- header << "predicted value"
- header << "classification" if predictions[0].feature_type=="classification"
- header << "confidence value" if predictions[0].confidence_values_available?
- header << "compound-uri"
+ header << "Compound" if add_pic
+ header << "Actual value"
+ header << "Predicted value"
+ header << "Classification" if predictions[0].feature_type=="classification"
+ header << "Confidence value" if predictions[0].confidence_values_available?
+ header << "Validation URI" if validation_uris
+ header << "Compound URI"
res.insert(0, header)
return res
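
The extra validation_uris argument lets callers attach the originating validation to every row of the prediction table. A minimal sketch of the call, assuming two already loaded prediction objects p1 and p2 (hypothetical names), mirroring how report_content.rb builds the URI arrays:

    # p1/p2 are hypothetical prediction objects (e.g. from Validation#get_predictions);
    # one URI array per prediction set, repeated once per instance.
    v_uris = [ Array.new(p1.num_instances, "http://host/validation/1"),
               Array.new(p2.num_instances, "http://host/validation/2") ]
    table  = Lib::OTPredictions.to_array([p1, p2], true, true, v_uris)
    # table[0] is the header row: ["Compound", "Actual value", ..., "Validation URI", "Compound URI"]
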
diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb
index 42da5fc..d387d24 100644
--- a/lib/prediction_data.rb
+++ b/lib/prediction_data.rb
@@ -270,7 +270,7 @@ module Lib
def self.classification_vals(dataset, compound, feature, accept_values)
v_indices = []
values(dataset, compound, feature).each do |v|
- i = accept_values.index(v.to_s)
+ i = accept_values.index(v)
raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+
accept_values.inspect unless v==nil or i!=nil
v_indices << i
@@ -294,4 +294,4 @@ module Lib
v_mod
end
end
-end
\ No newline at end of file
+end
diff --git a/lib/predictions.rb b/lib/predictions.rb
index 233267d..d929f1a 100755
--- a/lib/predictions.rb
+++ b/lib/predictions.rb
@@ -577,6 +577,31 @@ module Lib
# return weighted_sample_correlation_coefficient ** 2
#end
+ def concordance_correlation_coefficient
+ begin
+ numerator = 0
+ @predicted_values.size.times do |i|
+ numerator += (@actual_values[i]-@actual_mean) * (@predicted_values[i]-@prediction_mean) if
+ @actual_values[i]!=nil and @predicted_values[i]!=nil
+ end
+ numerator *= 2
+ denominator = total_sum_of_squares
+ denominator += prediction_total_sum_of_squares
+ denominator += @num_predicted * (@actual_mean - @prediction_mean)**2
+ ccc = numerator / denominator
+ ( ccc.infinite? || ccc.nan? ) ? 0 : ccc
+ rescue; 0; end
+ end
+
+ def prediction_total_sum_of_squares
+ #return @variance_actual * ( @num_predicted - 1 )
+ sum = 0
+ @predicted_values.size.times do |i|
+ sum += (@predicted_values[i]-@prediction_mean)**2 if @actual_values[i]!=nil and @predicted_values[i]!=nil
+ end
+ sum
+ end
+
def sample_correlation_coefficient
begin
# formula see http://en.wikipedia.org/wiki/Correlation_and_dependence#Pearson.27s_product-moment_coefficient
@@ -804,22 +829,45 @@ module Lib
end
puts "num values "+p.size.to_s
- pred = Predictions.new(p,a,c,"regression")
+ #a = [1.0,2.0, 3.0,4.0, 5.0]
+ #p = [1.5,2.25,3.0,3.75,4.5]
+
+ #a = [1.0,2.0,3.0,4.0,5.0]
+ #p = [1.5,2.5,3.5,4.5,5.5]
+
+ #p = a.collect{|v| v-0.5}
+ #p = a.collect{|v| v+0.5}
+
+ #p = [2.0,2.5,3.0,3.5,4.0]
+
+ c = Array.new(p.size,nil)
+
+ data = { :predicted_values => p, :actual_values => a, :confidence_values => c,
+ :feature_type => "regression", :accept_values => nil }
+
+ pred = Predictions.new(data)
puts "internal"
#puts "r-square old "+pred.r_square_old.to_s
puts "cor "+pred.sample_correlation_coefficient.to_s
- puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s
+ #puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s
puts "r-square "+pred.r_square.to_s
+ puts "ccc "+pred.concordance_correlation_coefficient.to_s
puts "R"
- @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r
- @@r.assign "v1",a
- @@r.assign "v2",p
- puts "r cor "+@@r.pull("cor(v1,v2)").to_s
- @@r.eval "fit <- lm(v1 ~ v2)"
- @@r.eval "sum <- summary(fit)"
- puts "r r-square "+@@r.pull("sum$r.squared").to_s
- puts "r adjusted-r-square "+@@r.pull("sum$adj.r.squared").to_s
+ rutil = OpenTox::RUtil.new
+
+ rutil.r.assign "v1",a
+ rutil.r.assign "v2",p
+ puts "r cor "+rutil.r.pull("cor(v1,v2)").to_s
+ rutil.r.eval "fit <- lm(v1 ~ v2)"
+ rutil.r.eval "sum <- summary(fit)"
+ puts "r r-square "+rutil.r.pull("sum$r.squared").to_s
+ puts "r adjusted-r-square "+rutil.r.pull("sum$adj.r.squared").to_s
+ #rutil.r.eval "save.image(\"/tmp/image.R\")"
+ #rutil.r.eval "require(epiR)"
+ #rutil.r.eval "tmp.ccc <- epi.ccc(v1,v2)"
+ #puts "r ccc "+rutil.r.pull("tmp.ccc$rho.c$est").to_s
+ rutil.quit_r
end
def prediction_feature_value_map(proc)
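
The new concordance_correlation_coefficient implements Lin's CCC, i.e. 2*cov(a,p) / (var(a) + var(p) + (mean(a) - mean(p))^2), computed only over pairs where both the actual and the predicted value are present. A self-contained sketch of the same computation on plain Ruby arrays (independent of the Predictions class), using the test values from the commented-out example above:

    # Lin's concordance correlation coefficient on paired arrays (sketch).
    def lin_ccc(actual, predicted)
      pairs  = actual.zip(predicted).reject { |a, p| a.nil? || p.nil? }
      n      = pairs.size.to_f
      a_mean = pairs.map { |a, _| a }.inject(:+) / n
      p_mean = pairs.map { |_, p| p }.inject(:+) / n
      covar  = pairs.inject(0.0) { |s, (a, p)| s + (a - a_mean) * (p - p_mean) }
      ss_a   = pairs.inject(0.0) { |s, (a, _)| s + (a - a_mean)**2 }
      ss_p   = pairs.inject(0.0) { |s, (_, p)| s + (p - p_mean)**2 }
      2 * covar / (ss_a + ss_p + n * (a_mean - p_mean)**2)
    end

    lin_ccc([1.0, 2.0, 3.0, 4.0, 5.0], [1.5, 2.25, 3.0, 3.75, 4.5])  # => 0.96
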
diff --git a/lib/validation_db.rb b/lib/validation_db.rb
index c3a3f71..086853e 100755
--- a/lib/validation_db.rb
+++ b/lib/validation_db.rb
@@ -6,8 +6,9 @@ require "lib/merge.rb"
module Validation
- VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :training_dataset_uri, :prediction_feature,
- :test_dataset_uri, :test_target_dataset_uri, :prediction_dataset_uri, :date ]
+ VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :algorithm_params,
+ :training_dataset_uri, :prediction_feature, :test_dataset_uri, :test_target_dataset_uri,
+ :prediction_dataset_uri, :date ]
VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ]
VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ]
VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG
@@ -38,10 +39,11 @@ module Validation
# :regression_statistics
VAL_REGR_PROPS = [ :root_mean_squared_error, :mean_absolute_error, :r_square, :weighted_r_square,
:target_variance_actual, :target_variance_predicted, :sum_squared_error, :sample_correlation_coefficient,
- :weighted_mean_absolute_error, :weighted_root_mean_squared_error ]
+ :weighted_mean_absolute_error, :weighted_root_mean_squared_error, :concordance_correlation_coefficient ]
CROSS_VAL_PROPS = [:dataset_uri, :num_folds, :stratified, :random_seed]
- CROSS_VAL_PROPS_REDUNDANT = [:crossvalidation_uri, :algorithm_uri, :date] + CROSS_VAL_PROPS
+ CROSS_VAL_PROPS_REDUNDANT = [:crossvalidation_uri, :algorithm_uri, :algorithm_params,
+ :prediction_feature, :date] + CROSS_VAL_PROPS
ALL_PROPS = VAL_PROPS + VAL_CV_PROPS + VAL_CLASS_PROPS + VAL_REGR_PROPS + CROSS_VAL_PROPS
@@ -55,6 +57,7 @@ module Validation
attribute :validation_type
attribute :model_uri
attribute :algorithm_uri
+ attribute :algorithm_params
attribute :training_dataset_uri
attribute :test_target_dataset_uri
attribute :test_dataset_uri
@@ -77,6 +80,11 @@ module Validation
index :model_uri
index :validation_type
index :crossvalidation_id
+ index :algorithm_uri
+ index :algorithm_params
+ index :prediction_feature
+ index :training_dataset_uri
+ index :test_dataset_uri
attr_accessor :subjectid
@@ -141,6 +149,8 @@ module Validation
class Crossvalidation < Ohm::Model
attribute :algorithm_uri
+ attribute :algorithm_params
+ attribute :prediction_feature
attribute :dataset_uri
attribute :date
attribute :num_folds
@@ -152,6 +162,8 @@ module Validation
attr_accessor :subjectid
index :algorithm_uri
+ index :algorithm_params
+ index :prediction_feature
index :dataset_uri
index :num_folds
index :random_seed
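
With algorithm_params and prediction_feature persisted and indexed on both Ohm models, stored results can be looked up by the complete experimental setup. A hedged sketch of the kind of finder query the new indices support (all URIs and values are placeholders; compare copy_cv_datasets in validation_service.rb):

    # Sketch: retrieve finished crossvalidations matching a given setup.
    Validation::Crossvalidation.find(
      :dataset_uri        => "http://host/dataset/1",
      :algorithm_uri      => "http://host/algorithm/lazar",
      :prediction_feature => "http://host/dataset/1/feature/endpoint",
      :num_folds          => 10,
      :finished           => true )
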
diff --git a/report/plot_factory.rb b/report/plot_factory.rb
index f114dd3..6e90dbc 100644
--- a/report/plot_factory.rb
+++ b/report/plot_factory.rb
@@ -4,7 +4,8 @@ ENV['RANK_PLOTTER_JAR'] = "RankPlotter/RankPlotter.jar" unless ENV['RANK_PLOTTER
CONF_PLOT_RANGE = { :accuracy => [0.45,1.05], :true_positive_rate => [0.45,1.05],:true_negative_rate => [0.45,1.05],
:false_positive_rate => [0.45,1.05], :false_negative_rate => [0.45,1.05], :positive_predictive_value => [0.45,1.05],
- :negative_predictive_value => [0.45,1.05], :r_square => [0, 1.05], :sample_correlation_coefficient => [0, 1.05] }
+ :negative_predictive_value => [0.45,1.05], :r_square => [0, 1.05], :sample_correlation_coefficient => [0, 1.05],
+ :concordance_correlation_coefficient => [0, 1.05] }
class Array
def swap!(i,j)
@@ -124,7 +125,7 @@ module Reports
else
Reports::r_util.feature_value_plot(out_files, validation_set.validations[0].training_feature_dataset_uri,
validation_set.validations[0].test_feature_dataset_uri, "Training Data", "Test Data",
- nil, true, validation_set.validations[0].subjectid, waiting_task )
+ nil, validation_set.validations[0].subjectid, waiting_task )
end
end
diff --git a/report/report_content.rb b/report/report_content.rb
index 80473c5..033b367 100755
--- a/report/report_content.rb
+++ b/report/report_content.rb
@@ -63,20 +63,17 @@ class Reports::ReportContent
end
end
- def add_predictions( validation_set,
- validation_attributes=[],
+ def add_predictions( validation_set,
+ add_validation_uris,
section_title="Predictions",
section_text=nil,
table_title="Predictions")
-
- #PENING
- raise "validation attributes not implemented in get prediction array" if validation_attributes.size>0
-
section_table = @xml_report.add_section(@current_section, section_title)
if validation_set.validations[0].get_predictions
@xml_report.add_paragraph(section_table, section_text) if section_text
+ v_uris = validation_set.validations.collect{|v| Array.new(v.num_instances.to_i,v.validation_uri)} if add_validation_uris
@xml_report.add_table(section_table, table_title, Lib::OTPredictions.to_array(validation_set.validations.collect{|v| v.get_predictions},
- true, true))
+ true, true, v_uris))
else
@xml_report.add_paragraph(section_table, "No prediction info available.")
end
diff --git a/report/report_factory.rb b/report/report_factory.rb
index 2bb74ee..f73ffd9 100755
--- a/report/report_factory.rb
+++ b/report/report_factory.rb
@@ -9,7 +9,7 @@ VAL_ATTR_CLASS = [ :num_instances, :num_unpredicted, :accuracy, :weighted_accura
:area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ]
VAL_ATTR_REGR = [ :num_instances, :num_unpredicted, :root_mean_squared_error,
:weighted_root_mean_squared_error, :mean_absolute_error, :weighted_mean_absolute_error, :r_square, :weighted_r_square,
- :sample_correlation_coefficient ]
+ :sample_correlation_coefficient, :concordance_correlation_coefficient ]
#VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :average_area_under_roc,
# :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate ]
@@ -113,6 +113,9 @@ module Reports::ReportFactory
report.add_confidence_plot(validation_set, :root_mean_squared_error, nil)
report.add_confidence_plot(validation_set, :r_square, nil)
report.align_last_two_images "Confidence Plots"
+ report.add_confidence_plot(validation_set, :sample_correlation_coefficient, nil)
+ report.add_confidence_plot(validation_set, :concordance_correlation_coefficient, nil)
+ report.align_last_two_images "More Confidence Plots"
end
task.progress(70) if task
report.add_train_test_plot( validation_set, false, OpenTox::SubTask.create(task,70,80) )
@@ -121,7 +124,7 @@ module Reports::ReportFactory
report.end_section
report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results")
- report.add_predictions( validation_set )
+ report.add_predictions( validation_set, false )
task.progress(100) if task
report
end
@@ -175,7 +178,8 @@ module Reports::ReportFactory
report.end_section
report.add_result(validation_set,
[:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri],
- "Results","Results")
+ "Results","Results") if
+ (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i)
when "regression"
report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text)
report.add_section("Plots")
@@ -189,12 +193,14 @@ module Reports::ReportFactory
report.end_section
report.add_result(validation_set,
[:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri],
- "Results","Results")
+ "Results","Results") if
+ (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i)
end
task.progress(90) if task
- report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results")
- report.add_predictions( validation_set ) #, [:crossvalidation_fold] )
+ report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") if
+ (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i)
+ report.add_predictions( validation_set, true )
task.progress(100) if task
report
end
diff --git a/report/report_persistance.rb b/report/report_persistance.rb
index e02387f..ccdebad 100755
--- a/report/report_persistance.rb
+++ b/report/report_persistance.rb
@@ -200,6 +200,7 @@ module Reports
index :report_type
index :validation_uris
index :crossvalidation_uris
+ index :algorithm_uris
attr_accessor :subjectid
diff --git a/report/validation_access.rb b/report/validation_access.rb
index aaa7bdc..e2a3978 100755
--- a/report/validation_access.rb
+++ b/report/validation_access.rb
@@ -145,20 +145,35 @@ class Reports::ValidationDB
end
def test_feature_dataset_uri(validation, subjectid)
- m = OpenTox::Model::Generic.find(validation.model_uri, subjectid)
- feat_gen = nil
- m.metadata[OT.parameters].each do |h|
- if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue]
- feat_gen = h[OT.paramValue]
+ training_features = Lib::DatasetCache.find( training_feature_dataset_uri(validation,subjectid), subjectid )
+ test_dataset = Lib::DatasetCache.find( validation.test_dataset_uri, subjectid )
+ features_found = true
+ training_features.features.keys.each do |f|
+ unless test_dataset.features.keys.include?(f)
+ features_found = false
+ LOGGER.debug "training-feature are not in test-datset #{f}"
break
end
- end if m and m.metadata[OT.parameters]
- raise "no feature creation alg found" unless feat_gen
- feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/
- uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid,
- :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid),
- :dataset_uri=>validation.test_dataset_uri})
- @@tmp_resources << uri
+ end
+ if features_found
+ LOGGER.debug "all training-features found in test-datset"
+ uri = test_dataset.uri
+ else
+ m = OpenTox::Model::Generic.find(validation.model_uri, subjectid)
+ feat_gen = nil
+ m.metadata[OT.parameters].each do |h|
+ if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue]
+ feat_gen = h[OT.paramValue]
+ break
+ end
+ end if m and m.metadata[OT.parameters]
+ raise "no feature creation alg found" unless feat_gen
+ feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/
+ uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid,
+ :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid),
+ :dataset_uri=>validation.test_dataset_uri})
+ @@tmp_resources << uri
+ end
uri
end
diff --git a/test/test_examples.rb b/test/test_examples.rb
index f3c0b7e..2b95cf2 100755
--- a/test/test_examples.rb
+++ b/test/test_examples.rb
@@ -274,6 +274,22 @@ module ValidationExamples
end
end
+ ########################################################################################################
+
+ class HamsterLooCrossvalidation < LooCrossValidation
+ def initialize
+ @dataset_file = File.new("data/hamster_carcinogenicity.csv","r")
+ end
+ end
+
+ class LazarHamsterLooCrossvalidation < HamsterLooCrossvalidation
+ def initialize
+ @algorithm_uri = File.join(CONFIG[:services]["opentox-algorithm"],"lazar")
+ @algorithm_params = "feature_generation_uri="+File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc")
+ super
+ end
+ end
+
########################################################################################################
class LazarHamsterMiniCrossvalidation < CrossValidation
@@ -828,6 +844,8 @@ module ValidationExamples
"22e" => [ AmbitVsNtuaTrainingTest ],
"22f" => [ AnotherAmbitJ48TrainingTest ],
"22g" => [ TumTrainingTest ],
+
+ "23a" => [ LazarHamsterLooCrossvalidation ],
}
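
The new "23a" entry wires the lazar LOO crossvalidation on the hamster dataset into the example runner; it can be exercised from the test driver in validation/validation_test.rb like the existing examples:

    run_test("23a")   # run the LOO crossvalidation example
    run_test("23a", :validation_uri => "http://local-ot/validation/crossvalidation/53")   # reuse an already computed crossvalidation
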
diff --git a/test/test_examples_util.rb b/test/test_examples_util.rb
index b48096d..82c4c48 100755
--- a/test/test_examples_util.rb
+++ b/test/test_examples_util.rb
@@ -1,4 +1,15 @@
+class Numeric
+ def to_human
+ return "0" if self==0
+ units = %w{B KB MB GB TB}
+ e = (Math.log(self)/Math.log(1024)).floor
+ s = "%.1f" % (to_f / 1024**e)
+ s.sub(/\.?0*$/, units[e])
+ end
+end
+
+
module ValidationExamples
class Util
@@ -335,6 +346,57 @@ module ValidationExamples
end
end
+ def compute_dataset_size
+ if @validation_uri =~ /crossvalidation/
+ cv = OpenTox::Crossvalidation.find(@validation_uri,@subjectid)
+ count = 0
+ size = 0
+ target = nil
+
+ cv.metadata[OT.validation].each do |v|
+ val = OpenTox::Validation.find(v)
+ dataset = {}
+ dataset[:test] = val.metadata[OT.testDataset]
+ dataset[:training] = val.metadata[OT.trainingDataset]
+ #dataset[:target] = val.metadata[OT.testTargetDataset]
+ raise if (target!=nil and target!=val.metadata[OT.testTargetDataset])
+ target = val.metadata[OT.testTargetDataset]
+
+ dataset[:prediction] = val.metadata[OT.predictionDataset]
+ m = val.metadata[OT.model]
+ model = OpenTox::Model::Generic.find(m)
+ dataset[:feature] = model.metadata[OT.featureDataset]
+
+ puts v
+ val_size = 0
+ dataset.each do |k,v|
+ s = size(v)
+ val_size += s
+ puts k.to_s+" "+v+" "+s.to_human
+ end
+ puts val_size.to_human
+ puts ""
+ size += val_size
+
+ count += 1
+ #break if (count>2)
+ end
+
+ puts "total "+size.to_human+" (count: "+count.to_s+")"
+ puts "avg "+(size/count.to_f).to_human
+
+ puts ""
+ puts "orig file: "+target+" "+size(target).to_human
+ end
+ end
+
+ private
+ def size(dataset)
+ f = "/home/martin/opentox-ruby/www/opentox/dataset/data/#{dataset.split("/")[-1]}.json"
+ File.exist?(f) ? File.new(f).size : 0
+ end
+
+ public
def verify_yaml
raise "cannot very validation, validation_uri is null" unless @validation_uri
@@ -443,4 +505,22 @@ module ValidationExamples
"crossvalidation"
end
end
+
+ class LooCrossValidation < ValidationExample
+ def params
+ [:algorithm_uri, :dataset_uri, :prediction_feature]
+ end
+
+ def opt_params
+ [ :algorithm_params ]
+ end
+
+ def report_type
+ "crossvalidation"
+ end
+
+ def validation_type
+ "crossvalidation/loo"
+ end
+ end
end
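
Numeric#to_human (added at the top of this file) renders byte counts with a unit and is used by the new compute_dataset_size helper when summing up dataset sizes; for example:

    512.to_human             # => "512B"
    1536.to_human            # => "1.5KB"
    (3 * 1024**3).to_human   # => "3GB"
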
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index 32ca971..1bc55f6 100755
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -6,8 +6,16 @@ end
require 'lib/dataset_cache.rb'
require 'validation/validation_service.rb'
+helpers do
+ def check_stratified(params)
+ params[:stratified] = "false" unless params[:stratified]
+ raise OpenTox::BadRequestError.new "stratified != true|false|super, is #{params[:stratified]}" unless
+ params[:stratified]=~/true|false|super/
+ end
+end
+
get '/crossvalidation/?' do
- LOGGER.info "list all crossvalidations"
+ LOGGER.info "list all crossvalidations "+params.inspect
model_uri = params.delete("model") || params.delete("model_uri")
if model_uri
model = OpenTox::Model::Generic.find(model_uri, @subjectid)
@@ -46,17 +54,20 @@ post '/crossvalidation/?' do
raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0
raise OpenTox::BadRequestError.new "illegal param-value num_folds: '"+params[:num_folds].to_s+"', must be integer >1" unless params[:num_folds]==nil or
params[:num_folds].to_i>1
-
+ check_stratified(params)
+
task = OpenTox::Task.create( "Perform crossvalidation", url_for("/crossvalidation", :full) ) do |task| #, params
cv_params = { :dataset_uri => params[:dataset_uri],
:algorithm_uri => params[:algorithm_uri],
+ :algorithm_params => params[:algorithm_params],
+ :prediction_feature => params[:prediction_feature],
+ :stratified => params[:stratified],
:loo => "false",
:subjectid => @subjectid }
[ :num_folds, :random_seed ].each{ |sym| cv_params[sym] = params[sym] if params[sym] }
- cv_params[:stratified] = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified]
cv = Validation::Crossvalidation.create cv_params
cv.subjectid = @subjectid
- cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95))
+ cv.perform_cv( OpenTox::SubTask.create(task,0,95) )
# computation of stats is cheap as dataset are already loaded into the memory
Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) )
cv.crossvalidation_uri
@@ -87,16 +98,19 @@ post '/crossvalidation/loo/?' do
raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0
raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0
raise OpenTox::BadRequestError.new "illegal param: num_folds, stratified, random_seed not allowed for loo-crossvalidation" if params[:num_folds] or
- params[:stratifed] or params[:random_seed]
+ params[:stratified] or params[:random_seed]
task = OpenTox::Task.create( "Perform loo-crossvalidation", url_for("/crossvalidation/loo", :full) ) do |task| #, params
- cv_params = { :dataset_uri => params[:dataset_uri],
+ cv_params = { :dataset_uri => params[:dataset_uri],
+ :algorithm_params => params[:algorithm_params],
+ :prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri],
:loo => "true" }
cv = Validation::Crossvalidation.create cv_params
cv.subjectid = @subjectid
- cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95))
+ cv.perform_cv( OpenTox::SubTask.create(task,0,95))
# computation of stats is cheap as dataset are already loaded into the memory
Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) )
+ cv.clean_loo_files( !(params[:algorithm_params] && params[:algorithm_params] =~ /feature_dataset_uri/) )
cv.crossvalidation_uri
end
return_task(task)
@@ -343,12 +357,13 @@ post '/training_test_validation/?' do
task = OpenTox::Task.create( "Perform training-test-validation", url_for("/", :full) ) do |task| #, params
v = Validation::Validation.create :validation_type => "training_test_validation",
:algorithm_uri => params[:algorithm_uri],
+ :algorithm_params => params[:algorithm_params],
:training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri],
:test_target_dataset_uri => params[:test_target_dataset_uri],
:prediction_feature => params[:prediction_feature]
v.subjectid = @subjectid
- v.validate_algorithm( params[:algorithm_params], task )
+ v.validate_algorithm( task )
v.validation_uri
end
return_task(task)
@@ -402,10 +417,11 @@ post '/bootstrapping' do
:test_target_dataset_uri => params[:dataset_uri],
:prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri],
+ :algorithm_params => params[:algorithm_params],
:training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri]
v.subjectid = @subjectid
- v.validate_algorithm( params[:algorithm_params], OpenTox::SubTask.create(task,33,100))
+ v.validate_algorithm( OpenTox::SubTask.create(task,33,100))
v.validation_uri
end
return_task(task)
@@ -452,18 +468,19 @@ post '/training_test_split' do
raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri].to_s.size>0
raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0
raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0
+ check_stratified(params)
task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params
- strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified]
params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature],
- @subjectid, strat, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33)))
+ @subjectid, params[:stratified], params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33)))
v = Validation::Validation.create :validation_type => "training_test_split",
:training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri],
:test_target_dataset_uri => params[:dataset_uri],
:prediction_feature => params[:prediction_feature],
- :algorithm_uri => params[:algorithm_uri]
+ :algorithm_uri => params[:algorithm_uri],
+ :algorithm_params => params[:algorithm_params]
v.subjectid = @subjectid
- v.validate_algorithm( params[:algorithm_params], OpenTox::SubTask.create(task,33,100))
+ v.validate_algorithm( OpenTox::SubTask.create(task,33,100))
v.validation_uri
end
return_task(task)
@@ -545,10 +562,10 @@ end
post '/plain_training_test_split' do
LOGGER.info "creating pure training test split "+params.inspect
raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri]
+ check_stratified(params)
task = OpenTox::Task.create( "Create data-split", url_for("/plain_training_test_split", :full) ) do |task|
- strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified]
result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid,
- strat, params[:split_ratio], params[:random_seed])
+ params[:stratified], params[:split_ratio], params[:random_seed], task)
content_type "text/uri-list"
result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n"
end
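
check_stratified now restricts the stratified parameter to the literal strings "false", "true" and "super" ("super" apparently stratifying over the whole feature set rather than only the prediction feature, see validation_service.rb), instead of coercing it to a boolean. A hedged sketch of a crossvalidation request with stratification enabled (host, dataset, feature and algorithm URIs are placeholders):

    # Sketch: start a stratified 10-fold crossvalidation via the REST interface.
    task_uri = OpenTox::RestClientWrapper.post(
      "http://host/validation/crossvalidation",
      { :dataset_uri        => "http://host/dataset/1",
        :algorithm_uri      => "http://host/algorithm/lazar",
        :prediction_feature => "http://host/dataset/1/feature/endpoint",
        :algorithm_params   => "feature_generation_uri=http://host/algorithm/fminer/bbrc",
        :num_folds          => 10,
        :stratified         => "true" })   # accepted values: "false", "true", "super"
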
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 614363d..8c8b11f 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -111,13 +111,13 @@ module Validation
end
# validates an algorithm by building a model and validating this model
- def validate_algorithm( algorithm_params=nil, task=nil )
+ def validate_algorithm( task=nil )
raise "validation_type missing" unless self.validation_type
raise OpenTox::BadRequestError.new "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1
params = { :dataset_uri => self.training_dataset_uri, :prediction_feature => self.prediction_feature }
- if (algorithm_params!=nil)
- algorithm_params.split(";").each do |alg_params|
+ if (self.algorithm_params!=nil)
+ self.algorithm_params.split(";").each do |alg_params|
alg_param = alg_params.split("=",2)
raise OpenTox::BadRequestError.new "invalid algorithm param: '"+alg_params.to_s+"'" unless alg_param.size==2 or alg_param[0].to_s.size<1 or alg_param[1].to_s.size<1
LOGGER.warn "algorihtm param contains empty space, encode? "+alg_param[1].to_s if alg_param[1] =~ /\s/
@@ -210,9 +210,10 @@ module Validation
algorithm_uri = self.algorithm_uri ? nil : model.metadata[OT.algorithm]
predicted_variable = model.predicted_variable(self.subjectid)
predicted_confidence = model.predicted_confidence(self.subjectid)
- raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression, "+
+ raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression: '#{feature_type}', "+
"please set rdf-type of predictedVariables feature '"+predicted_variable.to_s+
- "' to NominalFeature or NumericFeature" if (feature_type.to_s!="classification" and feature_type.to_s!="regression")
+ "' to NominalFeature or NumericFeature" if
+ (feature_type.to_s!="classification" and feature_type.to_s!="regression")
compute_prediction_data( feature_type, predicted_variable, predicted_confidence,
prediction_feature, algorithm_uri, task )
end
@@ -300,9 +301,26 @@ module Validation
class Crossvalidation
- def perform_cv ( prediction_feature, algorithm_params=nil, task=nil )
- create_cv_datasets( prediction_feature, OpenTox::SubTask.create(task, 0, 33) )
- perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) )
+ def perform_cv ( task=nil )
+ create_cv_datasets( OpenTox::SubTask.create(task, 0, 33) )
+ perform_cv_validations( OpenTox::SubTask.create(task, 33, 100) )
+ end
+
+ def clean_loo_files( delete_feature_datasets )
+ Validation.find( :crossvalidation_id => self.id, :validation_type => "crossvalidation" ).each do |v|
+ LOGGER.debug "loo-cleanup> delete training dataset "+v.training_dataset_uri
+ OpenTox::RestClientWrapper.delete v.training_dataset_uri,subjectid
+ if (delete_feature_datasets)
+ begin
+ model = OpenTox::Model::Generic.find(v.model_uri)
+ if model.metadata[OT.featureDataset]
+ LOGGER.debug "loo-cleanup> delete feature dataset "+model.metadata[OT.featureDataset]
+ OpenTox::RestClientWrapper.delete model.metadata[OT.featureDataset],subjectid
+ end
+ rescue
+ end
+ end
+ end
end
# deletes a crossvalidation, all validations are deleted as well
@@ -331,36 +349,35 @@ module Validation
end
# creates the cv folds
- def create_cv_datasets( prediction_feature, task=nil )
+ def create_cv_datasets( task=nil )
if self.loo=="true"
orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid)
self.num_folds = orig_dataset.compounds.size
self.random_seed = 0
- self.stratified = false
+ self.stratified = "false"
else
self.random_seed = 1 unless self.random_seed
self.num_folds = 10 unless self.num_folds
- self.stratified = false unless self.stratified
+ self.stratified = "false" unless self.stratified
end
- if copy_cv_datasets( prediction_feature )
+ if copy_cv_datasets()
# dataset folds of a previous crossvalidaiton could be used
task.progress(100) if task
else
- create_new_cv_datasets( prediction_feature, task )
+ create_new_cv_datasets( task )
end
end
# executes the cross-validation (build models and validates them)
- def perform_cv_validations( algorithm_params, task=nil )
+ def perform_cv_validations( task=nil )
- LOGGER.debug "perform cv validations "+algorithm_params.inspect
+ LOGGER.debug "perform cv validations"
i = 0
task_step = 100 / self.num_folds.to_f;
@tmp_validations.each do | val |
validation = Validation.create val
validation.subjectid = self.subjectid
- validation.validate_algorithm( algorithm_params,
- OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) )
+ validation.validate_algorithm( OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) )
raise "validation '"+validation.validation_uri+"' for crossvaldation could not be finished" unless
validation.finished
i += 1
@@ -377,14 +394,17 @@ module Validation
private
# copies datasets from an older crossvalidation on the same dataset and the same folds
# returns true if successfull, false otherwise
- def copy_cv_datasets( prediction_feature )
+ def copy_cv_datasets( )
+ # for backwards compatibility: matching crossvalidations with prediction_feature=nil is ok
cvs = Crossvalidation.find( {
:dataset_uri => self.dataset_uri,
:num_folds => self.num_folds,
:stratified => self.stratified,
:random_seed => self.random_seed,
:loo => self.loo,
- :finished => true} ).reject{ |cv| cv.id == self.id }
+ :finished => true} ).reject{ |cv| (cv.id == self.id ||
+ (cv.prediction_feature &&
+ cv.prediction_feature != self.prediction_feature)) }
cvs.each do |cv|
next if AA_SERVER and !OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"GET",self.subjectid)
tmp_val = []
@@ -402,7 +422,8 @@ module Validation
:crossvalidation_id => self.id,
:crossvalidation_fold => v.crossvalidation_fold,
:prediction_feature => prediction_feature,
- :algorithm_uri => self.algorithm_uri }
+ :algorithm_uri => self.algorithm_uri,
+ :algorithm_params => self.algorithm_params }
end
if tmp_val.size == self.num_folds.to_i
@tmp_validations = tmp_val
@@ -415,111 +436,81 @@ module Validation
# creates cv folds (training and testdatasets)
# stores uris in validation objects
- def create_new_cv_datasets( prediction_feature, task = nil )
+ def create_new_cv_datasets( task = nil )
LOGGER.debug "creating datasets for crossvalidation"
orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid)
raise OpenTox::NotFoundError.new "Dataset not found: "+self.dataset_uri.to_s unless orig_dataset
- if self.loo=="true"
- shuffled_compounds = orig_dataset.compounds
- else
- shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed )
- end
+ train_dataset_uris = []
+ test_dataset_uris = []
- unless self.stratified
+ meta = { DC.creator => self.crossvalidation_uri }
+ case stratified
+ when "false"
+ if self.loo=="true"
+ shuffled_compounds = orig_dataset.compounds
+ else
+ shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed )
+ end
split_compounds = shuffled_compounds.chunk( self.num_folds.to_i )
- else
- class_compounds = {} # "inactive" => compounds[], "active" => compounds[] ..
- accept_values = orig_dataset.accept_values(prediction_feature)
- raise OpenTox::BadRequestError.new("cannot apply stratification (not implemented for regression), acceptValue missing for prediction-feature '"+
- prediction_feature.to_s+"' in dataset '"+dataset_uri.to_s+"'") unless accept_values and accept_values.size>0
- accept_values.each do |value|
- class_compounds[value] = []
- shuffled_compounds.each do |c|
- #PENDING accept values are type string, data_entries may be boolean
- class_compounds[value] << c if orig_dataset.data_entries[c][prediction_feature].collect{|v| v.to_s}.include?(value)
- end
- end
- LOGGER.debug "stratified cv: different class values: "+class_compounds.keys.join(", ")
- LOGGER.debug "stratified cv: num instances for each class value: "+class_compounds.values.collect{|c| c.size}.join(", ")
-
- split_class_compounds = [] # inactive_compounds[fold_i][], active_compounds[fold_i][], ..
- class_compounds.values.each do |compounds|
- split_class_compounds << compounds.chunk( self.num_folds.to_i )
- end
- LOGGER.debug "stratified cv: splits for class values: "+split_class_compounds.collect{ |c| c.collect{ |cc| cc.size }.join("/") }.join(", ")
-
- # we cannot just merge the splits of the different class_values of each fold
- # this could lead to folds, which sizes differ for more than 1 compound
- split_compounds = []
- split_class_compounds.each do |split_comp|
- # step 1: sort current split in ascending order
- split_comp.sort!{|x,y| x.size <=> y.size }
- # step 2: add splits
- (0..self.num_folds.to_i-1).each do |i|
- unless split_compounds[i]
- split_compounds[i] = split_comp[i]
+ LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ")
+
+ self.num_folds.to_i.times do |n|
+ test_compounds = []
+ train_compounds = []
+ self.num_folds.to_i.times do |nn|
+ compounds = split_compounds[nn]
+ if n == nn
+ compounds.each{ |compound| test_compounds << compound}
else
- split_compounds[i] += split_comp[i]
- end
+ compounds.each{ |compound| train_compounds << compound}
+ end
end
- # step 3: sort (total) split in descending order
- split_compounds.sort!{|x,y| y.size <=> x.size }
+ raise "internal error, num test compounds not correct,"+
+ " is '#{test_compounds.size}', should be '#{(shuffled_compounds.size/self.num_folds.to_i)}'" unless
+ (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1
+ raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+
+ "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
+ datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s
+ meta[DC.title] = "training "+datasetname
+ LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s
+ train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys,
+ meta, self.subjectid ).uri
+ train_dataset_uris << train_dataset_uri
+ meta[DC.title] = "test "+datasetname
+ LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s
+ test_features = orig_dataset.features.keys.dclone - [self.prediction_feature]
+ test_dataset_uri = orig_dataset.split( test_compounds, test_features,
+ meta, self.subjectid ).uri
+ test_dataset_uris << test_dataset_uri
end
+ when /true|super/
+ if stratified=="true"
+ features = [ self.prediction_feature ]
+ else
+ features = nil
+ end
+ r_util = OpenTox::RUtil.new
+ train_datasets, test_datasets = r_util.stratified_k_fold_split(orig_dataset,meta,
+ "NA",self.num_folds.to_i,@subjectid,self.random_seed, features)
+ r_util.quit_r
+ train_dataset_uris = train_datasets.collect{|d| d.uri}
+ test_dataset_uris = test_datasets.collect{|d| d.uri}
+ else
+ raise OpenTox::BadRequestError.new
end
- LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ")
-
- test_features = orig_dataset.features.keys.dclone - [prediction_feature]
@tmp_validations = []
-
- (1..self.num_folds.to_i).each do |n|
-
- datasetname = 'cv'+self.id.to_s +
- #'_d'+orig_dataset.name.to_s +
- '_f'+n.to_s+'of'+self.num_folds.to_s+
- '_r'+self.random_seed.to_s+
- '_s'+self.stratified.to_s
- source = $url_provider.url_for('/crossvalidation',:full)
-
- test_compounds = []
- train_compounds = []
-
- (1..self.num_folds.to_i).each do |nn|
- compounds = split_compounds.at(nn-1)
-
- if n == nn
- compounds.each{ |compound| test_compounds.push(compound)}
- else
- compounds.each{ |compound| train_compounds.push(compound)}
- end
- end
-
- raise "internal error, num test compounds not correct" unless (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1
- raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+
- "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
-
- LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s
- #train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source )
- train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys,
- { DC.title => datasetname + '_train', DC.creator => source }, self.subjectid ).uri
-
- LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s
- #test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source )
- test_dataset_uri = orig_dataset.split( test_compounds, test_features,
- { DC.title => datasetname + '_test', DC.creator => source }, self.subjectid ).uri
-
- #make sure self.id is set
- #self.save if self.new?
+ self.num_folds.to_i.times do |n|
tmp_validation = { :validation_type => "crossvalidation",
- :training_dataset_uri => train_dataset_uri,
- :test_dataset_uri => test_dataset_uri,
+ :training_dataset_uri => train_dataset_uris[n],
+ :test_dataset_uri => test_dataset_uris[n],
:test_target_dataset_uri => self.dataset_uri,
- :crossvalidation_id => self.id, :crossvalidation_fold => n,
- :prediction_feature => prediction_feature,
- :algorithm_uri => self.algorithm_uri }
+ :crossvalidation_id => self.id, :crossvalidation_fold => (n+1),
+ :prediction_feature => self.prediction_feature,
+ :algorithm_uri => self.algorithm_uri,
+ :algorithm_params => self.algorithm_params}
@tmp_validations << tmp_validation
-
task.progress( n / self.num_folds.to_f * 100 ) if task
end
end
@@ -618,7 +609,7 @@ module Validation
# splits a dataset into test and training dataset
# returns map with training_dataset_uri and test_dataset_uri
- def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil )
+ def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", split_ratio=nil, random_seed=nil, task=nil )
split_ratio=0.67 unless split_ratio
split_ratio = split_ratio.to_f
random_seed=1 unless random_seed
@@ -634,15 +625,25 @@ module Validation
"' not found in dataset, features are: \n"+
orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature)
else
- LOGGER.warn "no prediciton feature given, all features included in test dataset"
+ LOGGER.warn "no prediciton feature given, all features will be included in test dataset"
end
- if stratified
+ meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) }
+
+ case stratified
+ when /true|super/
+ if stratified=="true"
+ raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature
+ features = [prediction_feature]
+ else
+ LOGGER.warn "prediction feature is ignored for super-stratified splits" if prediction_feature
+ features = nil
+ end
r_util = OpenTox::RUtil.new
- split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed )
+ train, test = r_util.stratified_split( orig_dataset, meta, "NA", split_ratio, subjectid, random_seed, features )
r_util.quit_r
- result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]}
- else
+ result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri}
+ when "false"
compounds = orig_dataset.compounds
raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
split = (compounds.size*split_ratio).to_i
@@ -656,22 +657,18 @@ module Validation
test_compounds = compounds[(split+1)..-1]
task.progress(33) if task
+ meta[DC.title] = "Training dataset split of "+orig_dataset.uri
result = {}
result[:training_dataset_uri] = orig_dataset.split( training_compounds,
- orig_dataset.features.keys,
- { DC.title => "Training dataset split of "+orig_dataset.title.to_s,
- DC.creator => $url_provider.url_for('/training_test_split',:full) },
- subjectid ).uri
+ orig_dataset.features.keys, meta, subjectid ).uri
task.progress(66) if task
+ meta[DC.title] = "Test dataset split of "+orig_dataset.uri
result[:test_dataset_uri] = orig_dataset.split( test_compounds,
- orig_dataset.features.keys.dclone - [prediction_feature],
- { DC.title => "Test dataset split of "+orig_dataset.title.to_s,
- DC.creator => $url_provider.url_for('/training_test_split',:full) },
- subjectid ).uri
+ orig_dataset.features.keys.dclone - [prediction_feature], meta, subjectid ).uri
task.progress(100) if task
- if !stratified and ENV['RACK_ENV'] =~ /test|debug/
+ if ENV['RACK_ENV'] =~ /test|debug/
raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless
Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
@@ -680,8 +677,9 @@ module Validation
raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
end
-
LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
+ else
+ raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}"
end
result
end
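
In the non-stratified branch the folds are still built by shuffling the compounds with the random seed, cutting them into num_folds chunks whose sizes differ by at most one, and letting each chunk serve once as the test set while the remaining chunks form the training set. A plain-Ruby sketch of that scheme (the service itself relies on its own Array#shuffle(seed) and Array#chunk(k) helpers):

    # Sketch of the fold construction used when stratified == "false".
    def chunk_into(arr, k)                 # k parts, sizes differing by at most 1
      base, rem = arr.size.divmod(k)
      sizes = Array.new(k) { |i| base + (i < rem ? 1 : 0) }
      pos = 0
      sizes.map { |s| part = arr[pos, s]; pos += s; part }
    end

    def cv_folds(compounds, k, seed)
      shuffled = compounds.shuffle(random: Random.new(seed))
      parts = chunk_into(shuffled, k)
      (0...k).map { |n| [(parts[0...n] + parts[(n + 1)..-1]).flatten(1), parts[n]] }
    end

    cv_folds((1..10).to_a, 3, 1).each { |train, test| puts "#{train.size}/#{test.size}" }
    # prints 6/4, 7/3, 7/3
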
diff --git a/validation/validation_test.rb b/validation/validation_test.rb
index ae71749..70f3ca4 100755
--- a/validation/validation_test.rb
+++ b/validation/validation_test.rb
@@ -60,6 +60,60 @@ class ValidationTest < Test::Unit::TestCase
begin
$test_case = self
+# dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=0"
+# test_dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=1"
+# #prediction_feature = "http://apps.ideaconsult.net:8080/ambit2/feature/528321"
+# prediction_feature = "http://apps.ideaconsult.net:8080/ambit2/feature/528402"
+# prediction_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/RandomForest"
+# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/leverage"
+# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/distanceMahalanobis"
+# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/pcaRanges"
+# ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/RandomForest"
+# post "/training_test_validation",{:training_dataset_uri=>dataset_uri, :test_dataset_uri=>test_dataset_uri,
+# :prediction_feature => prediction_feature, :algorithm_uri=>"http://local-ot/adwrap",
+# :algorithm_params=>"prediction_algorithm=#{prediction_algorithm};ad_algorithm=#{ad_algorithm}"}
+# puts last_response.body
+# uri = last_response.body
+# rep = wait_for_task(uri)
+# puts rep
+#
+# post "/report/method_comparison",
+# {:validation_uris=>"http://local-ot/validation/433,http://local-ot/validation/434,http://local-ot/validation/435,http://local-ot/validation/436,http://local-ot/validation/437,http://local-ot/validation/438,http://local-ot/validation/439,http://local-ot/validation/440,http://local-ot/validation/441,http://local-ot/validation/442,http://local-ot/validation/crossvalidation/30,",
+# :identifier=>"random,random,random,random,random,random,random,random,random,random,crossvalidated,"}
+
+# post "/report/method_comparison",
+# {:validation_uris=>"http://local-ot/validation/389,http://local-ot/validation/390,http://local-ot/validation/391,http://local-ot/validation/392",
+# :identifier=>"split1,split1,split2,split2"}
+
+
+ #post "/report/validation",{:validation_uris=>"http://local-ot/validation/171"}
+ #post "/report/validation",{:validation_uris=>"http://local-ot/validation/389"}
+
+ #dataset_uri = OpenTox::Dataset.create_from_csv_file(File.new("data/EPAFHM.csv").path, nil).uri
+ #puts dataset_uri
+
+# #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603306?feature_uris[]=http://apps.ideaconsult.net:8080/ambit2/feature/764036"
+# #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603204"
+# post "/plain_training_test_split",{:dataset_uri=>dataset_uri, :stratified=>"true", :split_ratio=>0.3}
+# puts last_response.body
+# uri = last_response.body
+# rep = wait_for_task(uri)
+# puts rep
+ #OpenTox::RestClientWrapper.post("http://opentox.informatik.uni-freiburg.de/validation/plain_training_test_split",
+ # {:dataset_uri=>dataset_uri, :stratified=>"true", :split_ratio=>0.7407407407})
+
+ #puts OpenTox::Dataset.create_from_csv_file(File.new("data/hamster_carcinogenicity.csv").path, nil).uri
+ #puts OpenTox::Dataset.create_from_csv_file(File.new("data/multi_cell_call.csv").path, nil).uri
+
+ #puts OpenTox::Dataset.find("http://opentox.informatik.uni-freiburg.de/dataset/98").compounds.size
+
+#
+# #post "/plain_training_test_split",{:dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/603204", :stratified=>"true"}
+#
+#
+#
+
+
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/6907",
# :prediction_dataset_uri=>"http://local-ot/dataset/6909",
@@ -71,20 +125,19 @@ class ValidationTest < Test::Unit::TestCase
# #:regression=>"true"}
# :classification=>"true"}
#
-# puts last_response.body
-# uri = last_response.body
-# rep = wait_for_task(uri)
-# puts rep
+
#get 'crossvalidation/19/statistics'
#get 'crossvalidation/189/statistics'
#puts last_response.body
-# run_test("1b")
+
+ #run_test("13a")
+ # run_test("1a",:validation_uri=>"http://local-ot/validation/513")
#get '/crossvalidation/79/predictions',nil,'HTTP_ACCEPT' => "application/x-yaml"
#puts last_response.body
- run_test("22f") #,:validation_uri=>"http://local-ot/validation/84" )
+ # run_test("22f") #,:validation_uri=>"http://local-ot/validation/84" )
#run_test("21b")
@@ -109,12 +162,6 @@ class ValidationTest < Test::Unit::TestCase
# puts rep
# 205 206 207
-# post '/report/algorithm_comparison',{:validation_uris=>"http://local-ot/validation/crossvalidation/149,http://local-ot/validation/crossvalidation/210",
-# :identifier=>"bbrc,last"}
-# uri = last_response.body
-# rep = wait_for_task(uri)
-# puts rep
-
#run_test("1a", {:validation_uri=>"http://local-ot/validation/305"})
# puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
#run_test("3a",{:validation_uri=>"http://local-ot/validation/crossvalidation/6"})
@@ -123,14 +170,33 @@ class ValidationTest < Test::Unit::TestCase
# puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
#run_test("14a") #,{:validation_uri=>"http://local-ot/validation/crossvalidation/148"})
# puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
-
- #run_test("1a")
-# run_test("3d",{
-# :dataset_uri => "http://local-ot/dataset/2897",
-# :prediction_feature => "http://local-ot/dataset/2897/feature/Hamster%20Carcinogenicity",
+ #run_test("3a")
+ #run_test("3d",{
+ # :dataset_uri => "http://local-ot/dataset/447",
+ # :prediction_feature => "http://local-ot/dataset/447/feature/Hamster%20Carcinogenicity",
+ # :random_seed => 1
+ # })
+
+ #run_test("23a")
+ run_test("23a",{:validation_uri=>"http://local-ot/validation/crossvalidation/53"})
+ #run_test("23a",{:validation_uri=>"http://local-ot/validation/crossvalidation/47"})
+ #23a loo {:validation_uri=>"http://local-ot/validation/crossvalidation/47"})
+ #loo mit datasets auf ortona {:validation_uri=>"http://local-ot/validation/crossvalidation/46"}
+
+# run_test("14d",{
+# :dataset_uri => "http://local-ot/dataset/508",
+# :prediction_feature => "http://local-ot/dataset/508/feature/LC50_mmol",
# :random_seed => 1
# })
+
+ #post '/report/algorithm_comparison',{
+ # :validation_uris=>"http://local-ot/validation/crossvalidation/9,http://local-ot/validation/crossvalidation/10",
+ # :identifier=>"bbrc,last",
+ # :ttest_attributes=>"num_instances,num_without_class,num_unpredicted,real_runtime,percent_without_class,percent_unpredicted"}
+ #uri = last_response.body
+ #rep = wait_for_task(uri)
+ #puts rep
#run_test("14",{
# :dataset_uri => "http://local-ot/dataset/3877",
@@ -189,6 +255,10 @@ class ValidationTest < Test::Unit::TestCase
LOGGER.debug "validation done '"+ex.validation_uri.to_s+"'"
end
+
+ #ex.compute_dataset_size
+ #break
+
if !delete and ex.validation_uri
if SUBJECTID
puts ex.validation_uri+"?subjectid="+CGI.escape(SUBJECTID)