author    mguetlein <martin.guetlein@gmail.com>  2012-11-20 12:14:13 +0100
committer mguetlein <martin.guetlein@gmail.com>  2012-11-20 12:14:13 +0100
commit    78201e10a1f4695c65f431d07512188363f897fb (patch)
tree      6f1759b3cbdc164c045001307fb7fa8bafbd878e
parent    9bfa9a593d03375d75663abe3aa9ef33b0163702 (diff)
adjust validation to dataset changes, remove test_target_dataset concept (never worked for multiple compound occurrences with different values), fix splitting (multiple compound occurrences are now split into different sets as well) [branch: oldarch]
-rwxr-xr-x  example.rb                              1
-rw-r--r--  lib/prediction_data.rb                138
-rwxr-xr-x  lib/validation_db.rb                    4
-rw-r--r--  report/plot_factory.rb                  9
-rwxr-xr-x  report/validation_access.rb            15
-rwxr-xr-x  test/test_examples.rb                   4
-rwxr-xr-x  test/test_examples_util.rb              7
-rwxr-xr-x  validation/validation_application.rb   12
-rwxr-xr-x  validation/validation_service.rb      164
-rwxr-xr-x  validation/validation_test.rb          25
10 files changed, 138 insertions, 241 deletions
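
For orientation, the central API change is the dropped test_target parameter of Lib::PredictionData.create. A minimal before/after sketch with placeholder argument names (the real signatures appear in the lib/prediction_data.rb hunk below):

  # before: a parallel test_target_dataset_uris argument carried the actual
  # endpoint values whenever the test dataset itself lacked them
  Lib::PredictionData.create( feature_type, test_dataset_uris, test_target_dataset_uris,
    prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences,
    subjectid, task )

  # after: actual values must be part of the test dataset, so the extra
  # argument disappears
  Lib::PredictionData.create( feature_type, test_dataset_uris, prediction_feature,
    prediction_dataset_uris, predicted_variables, predicted_confidences,
    subjectid, task )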
diff --git a/example.rb b/example.rb
index 636579e..11af160 100755
--- a/example.rb
+++ b/example.rb
@@ -96,7 +96,6 @@ class Example
v = Validation::Validation.new :training_dataset_uri => split_params[:training_dataset_uri],
:validation_type => "test_set_validation",
:test_dataset_uri => split_params[:test_dataset_uri],
- :test_target_dataset_uri => data_uri,
:prediction_feature => URI.decode(@@feature),
:algorithm_uri => @@alg
v.validate_algorithm( @@alg_params, OpenTox::SubTask.new(task, 20, 40) )
diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb
index d387d24..f42cd9f 100644
--- a/lib/prediction_data.rb
+++ b/lib/prediction_data.rb
@@ -1,6 +1,7 @@
module Lib
+
class PredictionData
CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/
@@ -53,17 +54,14 @@ module Lib
@compounds
end
- def self.create( feature_type, test_dataset_uris, test_target_dataset_uris,
- prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences,
- subjectid=nil, task=nil )
+ def self.create( feature_type, test_dataset_uris, prediction_feature, prediction_dataset_uris,
+ predicted_variables, predicted_confidences, subjectid=nil, task=nil )
test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array)
- test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array)
prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array)
predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array)
predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array)
LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect
- LOGGER.debug "loading prediction -- test-target-datset: "+test_target_dataset_uris.inspect
LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect
LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect
LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect
@@ -84,7 +82,6 @@ module Lib
test_dataset_uris.size.times do |i|
test_dataset_uri = test_dataset_uris[i]
- test_target_dataset_uri = test_target_dataset_uris[i]
prediction_dataset_uri = prediction_dataset_uris[i]
predicted_variable = predicted_variables[i]
predicted_confidence = predicted_confidences[i]
@@ -94,35 +91,18 @@ module Lib
test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid
raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset
- if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri
- test_target_dataset_uri = test_dataset_uri
- test_target_dataset = test_dataset
- raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+
- "prediction_feature: '"+prediction_feature.to_s+"'\n"+
- "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
- "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
- else
- test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid
- raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset
- if CHECK_VALUES
- test_dataset.compounds.each do |c|
- raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c)
- end
- end
- raise "prediction_feature not found in test_target_dataset\n"+
- "prediction_feature: '"+prediction_feature.to_s+"'\n"+
- "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
- "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
- end
+ raise "prediction_feature not found in test_dataset\n"+
+ "prediction_feature: '"+prediction_feature.to_s+"'\n"+
+ "test_dataset: '"+test_dataset_uri.to_s+"'\n"+
+ "available features are: "+test_dataset.features.inspect if test_dataset.features.keys.index(prediction_feature)==nil
- compounds = test_dataset.compounds
- LOGGER.debug "test dataset size: "+compounds.size.to_s
- raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0
+ LOGGER.debug "test dataset size: "+test_dataset.compounds.size.to_s
+ raise "test dataset is empty "+test_dataset_uri.to_s unless test_dataset.compounds.size>0
if feature_type=="classification"
- av = test_target_dataset.accept_values(prediction_feature)
+ av = test_dataset.accept_values(prediction_feature)
raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+
- test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2
+ test_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2
if accept_values==nil
accept_values=av
else
@@ -131,20 +111,15 @@ module Lib
end
actual_values = []
- tmp_compounds = []
- compounds.each do |c|
+ test_dataset.compounds.size.times do |c_idx|
case feature_type
when "classification"
- vals = classification_vals(test_target_dataset, c, prediction_feature, accept_values)
+ actual_values << classification_val(test_dataset, c_idx, prediction_feature, accept_values)
when "regression"
- vals = regression_vals(test_target_dataset, c, prediction_feature)
- end
- vals.each do |v|
- actual_values << v
- tmp_compounds << c
+ actual_values << numeric_val(test_dataset, c_idx, prediction_feature)
end
+ #raise "WTF #{c_idx} #{test_dataset.compounds[c_idx]} #{actual_values[-1]} #{actual_values[-2]}" if c_idx>0 and test_dataset.compounds[c_idx]==test_dataset.compounds[c_idx-1] and actual_values[-1]!=actual_values[-2]
end
- compounds = tmp_compounds
task.progress( task_status += task_step ) if task # loaded actual values
prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid
@@ -160,41 +135,42 @@ module Lib
"prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+
"available features are: "+prediction_dataset.features.inspect if predicted_confidence and prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0
- raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+
+ raise "more predicted than test compounds, #test: "+test_dataset.compounds.size.to_s+" < #prediction: "+
prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+
- prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size
+ prediction_dataset_uri if test_dataset.compounds.size < prediction_dataset.compounds.size
if CHECK_VALUES
prediction_dataset.compounds.each do |c|
raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+
- compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil
+ test_dataset.compounds.collect{|c| c.to_s}.join("\n") unless test_dataset.compounds.include?(c)
end
end
predicted_values = []
confidence_values = []
- count = 0
- compounds.each do |c|
- if prediction_dataset.compounds.index(c)==nil
+
+ test_dataset.compounds.size.times do |test_c_idx|
+ c = test_dataset.compounds[test_c_idx]
+ pred_c_idx = prediction_dataset.compound_index(test_dataset,test_c_idx)
+ if pred_c_idx==nil
+ raise "internal error: mapping failed" if prediction_dataset.compounds.include?(c)
predicted_values << nil
confidence_values << nil
else
+ raise "internal error: mapping failed" unless c==prediction_dataset.compounds[pred_c_idx]
case feature_type
when "classification"
- vals = classification_vals(prediction_dataset, c, predicted_variable, accept_values)
+ predicted_values << classification_val(prediction_dataset, pred_c_idx, predicted_variable, accept_values)
when "regression"
- vals = regression_vals(prediction_dataset, c, predicted_variable)
+ predicted_values << numeric_val(prediction_dataset, pred_c_idx, predicted_variable)
end
- raise "not yet implemented: more than one prediction for one compound" if vals.size>1
- predicted_values << vals[0]
if predicted_confidence
- confidence_values << confidence_val(prediction_dataset, c, predicted_confidence)
+ confidence_values << numeric_val(prediction_dataset, pred_c_idx, predicted_confidence)
else
confidence_values << nil
end
end
- count += 1
end
- all_compounds += compounds
+ all_compounds += test_dataset.compounds
all_predicted_values += predicted_values
all_actual_values += actual_values
all_confidence_values += confidence_values
@@ -237,61 +213,23 @@ module Lib
end
private
- def self.regression_vals(dataset, compound, feature)
- v_num = []
- values(dataset, compound, feature).each do |v|
- if v==nil or v.is_a?(Numeric)
- v_num << v
- else
- begin
- v_num << v.to_f
- rescue
- LOGGER.warn "no numeric value for regression: '"+v.to_s+"'"
- v_num << nil
- end
- end
- end
- v_num
- end
-
- def self.confidence_val(dataset, compound, confidence)
- v = values(dataset, compound, confidence)
- raise "not yet implemented: duplicate conf value" if v.size>1
+ def self.numeric_val(dataset, compound_index, feature)
+ v = dataset.data_entry_value(compound_index, feature)
begin
- v = v[0]
v = v.to_f unless v==nil or v.is_a?(Numeric)
v
rescue
- LOGGER.warn "no numeric value for confidence '"+v.to_s+"'"
+ LOGGER.warn "no numeric value for feature '#{feature}' : '#{v}'"
nil
end
end
- def self.classification_vals(dataset, compound, feature, accept_values)
- v_indices = []
- values(dataset, compound, feature).each do |v|
- i = accept_values.index(v)
- raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+
- accept_values.inspect unless v==nil or i!=nil
- v_indices << i
- end
- v_indices
+ def self.classification_val(dataset, compound_index, feature, accept_values)
+ v = dataset.data_entry_value(compound_index, feature)
+ i = accept_values.index(v)
+ raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+
+ accept_values.inspect unless v==nil or i!=nil
+ i
end
-
- def self.values(dataset, compound, feature)
- return [nil] if dataset.data_entries[compound]==nil
- if feature==nil
- v = dataset.data_entries[compound].values[0]
- else
- v = dataset.data_entries[compound][feature]
- end
- return [nil] if v==nil
- # sanitiy checks
- raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array)
- v.each{|vv| raise "array-elem is array" if vv.is_a?(Array)}
- # replace empty strings with nil
- v_mod = v.collect{|vv| (vv.to_s().size==0 ? nil : vv)}
- v_mod
- end
end
end
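
The methods above replace compound-URI lookups with positional indices, which is what makes multiple occurrences of the same compound representable. A minimal standalone sketch of the idea (sample data invented; Dataset#data_entry_value and Dataset#compound_index are the accessors introduced above):

  # a dataset may list the same compound URI at several positions,
  # each position carrying its own endpoint value
  compounds = ["c/1", "c/2", "c/1"]              # "c/1" occurs twice
  values    = ["active", "inactive", "inactive"]

  # keyed access (old code) collapses duplicates into one entry
  by_uri = {}
  compounds.each_with_index { |c, i| (by_uri[c] ||= []) << values[i] }
  by_uri["c/1"]   # => ["active", "inactive"] -- which occurrence is which?

  # positional access (new code) keeps every occurrence as its own instance
  compounds.size.times do |c_idx|
    puts "#{c_idx}: #{compounds[c_idx]} -> #{values[c_idx]}"
  end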
diff --git a/lib/validation_db.rb b/lib/validation_db.rb
index 086853e..9d67cf0 100755
--- a/lib/validation_db.rb
+++ b/lib/validation_db.rb
@@ -7,8 +7,7 @@ require "lib/merge.rb"
module Validation
VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :algorithm_params,
- :training_dataset_uri, :prediction_feature, :test_dataset_uri, :test_target_dataset_uri,
- :prediction_dataset_uri, :date ]
+ :training_dataset_uri, :prediction_feature, :test_dataset_uri, :prediction_dataset_uri, :date ]
VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ]
VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ]
VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG
@@ -59,7 +58,6 @@ module Validation
attribute :algorithm_uri
attribute :algorithm_params
attribute :training_dataset_uri
- attribute :test_target_dataset_uri
attribute :test_dataset_uri
attribute :prediction_dataset_uri
attribute :prediction_feature
diff --git a/report/plot_factory.rb b/report/plot_factory.rb
index 6e90dbc..ad73170 100644
--- a/report/plot_factory.rb
+++ b/report/plot_factory.rb
@@ -106,14 +106,13 @@ module Reports
train = []
test = []
validation_set.validations.each do |v|
- [[v.test_dataset_uri, test, v.test_target_dataset_uri],
- [v.training_dataset_uri, train, v.training_dataset_uri]].each do |uri,array,uri2|
+ [[v.test_dataset_uri, test],
+ [v.training_dataset_uri, train]].each do |uri,array|
d = Lib::DatasetCache.find(uri, validation_set.validations[0].subjectid)
- d2 = Lib::DatasetCache.find((uri2 ? uri2 : uri), validation_set.validations[0].subjectid)
d.compounds.each do |c|
- d2.data_entries[c][v.prediction_feature].each do |val|
+ d.data_entries[c][v.prediction_feature].each do |val|
array << val
- end if d2.data_entries[c] and d2.data_entries[c][v.prediction_feature]
+ end if d.data_entries[c] and d.data_entries[c][v.prediction_feature]
end
end
end
diff --git a/report/validation_access.rb b/report/validation_access.rb
index e2a3978..784f928 100755
--- a/report/validation_access.rb
+++ b/report/validation_access.rb
@@ -187,8 +187,8 @@ class Reports::ValidationDB
def get_predictions(validation, filter_params, subjectid, task)
# we need compound info, cannot reuse stored prediction data
data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri,
- validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri,
- validation.predicted_variable, validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) )
+ validation.prediction_feature, validation.prediction_dataset_uri, validation.predicted_variable,
+ validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) )
data = Lib::PredictionData.filter_data( data.data, data.compounds,
filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil
task.progress(100) if task
@@ -197,14 +197,13 @@ class Reports::ValidationDB
def get_accept_values( validation, subjectid=nil )
# PENDING So far, one has to load the whole dataset to get the accept_value from ambit
- test_target_datasets = validation.test_target_dataset_uri
- test_target_datasets = validation.test_dataset_uri unless test_target_datasets
+ test_datasets = validation.test_dataset_uri
res = nil
- test_target_datasets.split(";").each do |test_target_dataset|
- d = Lib::DatasetCache.find( test_target_dataset, subjectid )
- raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d
+ test_datasets.split(";").each do |test_dataset|
+ d = Lib::DatasetCache.find( test_dataset, subjectid )
+ raise "cannot get test target dataset for accept values, dataset: "+test_dataset.to_s unless d
accept_values = d.accept_values(validation.prediction_feature)
- raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+
+ raise "cannot get accept values from dataset "+test_dataset.to_s+" for feature "+
validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil
raise "different accept values" if res && res!=accept_values
res = accept_values
diff --git a/test/test_examples.rb b/test/test_examples.rb
index 2b95cf2..a07456b 100755
--- a/test/test_examples.rb
+++ b/test/test_examples.rb
@@ -203,11 +203,9 @@ module ValidationExamples
class HamsterTrainingTest < TrainingTestValidation
def initialize
-# @test_target_dataset_file = File.new("data/hamster_carcinogenicity.yaml","r")
# @training_dataset_file = File.new("data/hamster_carcinogenicity.train.yaml","r")
# @test_dataset_file = File.new("data/hamster_carcinogenicity.test.yaml","r")
- @test_target_dataset_file = File.new("data/hamster_carcinogenicity.csv","r")
@training_dataset_file = File.new("data/hamster_carcinogenicity.train.csv","r")
@test_dataset_file = File.new("data/hamster_carcinogenicity.test.csv","r")
@@ -667,11 +665,9 @@ module ValidationExamples
class HamsterTrainingTest < TrainingTestValidation
def initialize
-# @test_target_dataset_file = File.new("data/hamster_carcinogenicity.yaml","r")
# @training_dataset_file = File.new("data/hamster_carcinogenicity.train.yaml","r")
# @test_dataset_file = File.new("data/hamster_carcinogenicity.test.yaml","r")
- @test_target_dataset_file = File.new("data/hamster_carcinogenicity.csv","r")
@training_dataset_file = File.new("data/hamster_carcinogenicity.train.csv","r")
@test_dataset_file = File.new("data/hamster_carcinogenicity.test.csv","r")
diff --git a/test/test_examples_util.rb b/test/test_examples_util.rb
index 82c4c48..b7f170a 100755
--- a/test/test_examples_util.rb
+++ b/test/test_examples_util.rb
@@ -238,8 +238,6 @@ module ValidationExamples
:model_uri,
:test_dataset_uri,
:test_dataset_file,
- :test_target_dataset_uri,
- :test_target_dataset_file,
:training_dataset_uri,
:training_dataset_file,
:dataset_uri,
@@ -258,7 +256,6 @@ module ValidationExamples
def upload_files
[[:test_dataset_uri, :test_dataset_file],
- [:test_target_dataset_uri, :test_target_dataset_file],
[:training_dataset_uri, :training_dataset_file],
[:dataset_uri, :dataset_file]].each do |a|
uri = a[0]
@@ -438,7 +435,7 @@ module ValidationExamples
end
def opt_params
- [ :prediction_feature, :test_target_dataset_uri ]
+ [ :prediction_feature ]
end
def validation_type
@@ -452,7 +449,7 @@ module ValidationExamples
end
def opt_params
- [ :algorithm_params, :test_target_dataset_uri ]
+ [ :algorithm_params ]
end
def validation_type
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index 1bc55f6..f146b59 100755
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -308,7 +308,6 @@ post '/test_set_validation' do
v = Validation::Validation.create :validation_type => "test_set_validation",
:model_uri => params[:model_uri],
:test_dataset_uri => params[:test_dataset_uri],
- :test_target_dataset_uri => params[:test_target_dataset_uri],
:prediction_feature => params[:prediction_feature]
v.subjectid = @subjectid
v.validate_model( task )
@@ -340,7 +339,6 @@ get '/test_set_validation' do
post_command = OpenTox::PostCommand.new request.url,"Perform test-set-validation"
post_command.attributes << OpenTox::PostAttribute.new("model_uri")
post_command.attributes << OpenTox::PostAttribute.new("test_dataset_uri")
- post_command.attributes << OpenTox::PostAttribute.new("test_target_dataset_uri",false,nil,"Specify if target endpoint values are not available in test dataset.")
post_command.attributes << OpenTox::PostAttribute.new("prediction_feature",false,nil,"Default is 'dependentVariables' of the model.")
content_type "text/html"
OpenTox.text_to_html uri_list,@subjectid,related_links,description,post_command
@@ -360,7 +358,6 @@ post '/training_test_validation/?' do
:algorithm_params => params[:algorithm_params],
:training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri],
- :test_target_dataset_uri => params[:test_target_dataset_uri],
:prediction_feature => params[:prediction_feature]
v.subjectid = @subjectid
v.validate_algorithm( task )
@@ -392,7 +389,6 @@ get '/training_test_validation' do
post_command.attributes << OpenTox::PostAttribute.new("algorithm_uri")
post_command.attributes << OpenTox::PostAttribute.new("training_dataset_uri")
post_command.attributes << OpenTox::PostAttribute.new("test_dataset_uri")
- post_command.attributes << OpenTox::PostAttribute.new("test_target_dataset_uri",false,nil,"Specify if target endpoint values are not available in test dataset.")
post_command.attributes << OpenTox::PostAttribute.new("prediction_feature")
post_command.attributes << OpenTox::PostAttribute.new("algorithm_params",false,nil,"Params used for model building, separate with ';', example: param1=v1;param2=v2")
content_type "text/html"
@@ -414,7 +410,6 @@ post '/bootstrapping' do
params[:random_seed], OpenTox::SubTask.create(task,0,33)) )
LOGGER.info "params after bootstrapping: "+params.inspect
v = Validation::Validation.create :validation_type => "bootstrapping",
- :test_target_dataset_uri => params[:dataset_uri],
:prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri],
:algorithm_params => params[:algorithm_params],
@@ -470,12 +465,11 @@ post '/training_test_split' do
raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0
check_stratified(params)
task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params
- params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature],
+ params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], (params[:stratified].to_s=~/true/ ? params[:prediction_feature] : nil),
@subjectid, params[:stratified], params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33)))
v = Validation::Validation.create :validation_type => "training_test_split",
:training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri],
- :test_target_dataset_uri => params[:dataset_uri],
:prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri],
:algorithm_params => params[:algorithm_params]
@@ -543,7 +537,6 @@ post '/cleanup_datasets/?' do
end
Validation::Validation.all.each do |val|
used_datasets << val.training_dataset_uri
- used_datasets << val.test_target_dataset_uri
used_datasets << val.test_dataset_uri
used_datasets << val.prediction_dataset_uri
end
@@ -595,7 +588,8 @@ post '/validate_datasets' do
feature_type = "regression" if params.delete("regression")!=nil
v = Validation::Validation.create params
v.subjectid = @subjectid
- v.compute_validation_stats(feature_type,predicted_variable,predicted_confidence,nil,nil,false,task)
+ v.compute_prediction_data(feature_type,predicted_variable,predicted_confidence,v.prediction_feature,nil,task)
+ v.compute_validation_stats()#feature_type,predicted_variable,predicted_confidence,nil,nil,false,task)
end
v.validation_uri
end
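
With the parameter removed from the routes, callers simply omit it. A hypothetical Rack::Test-style request against the updated endpoint (URIs are placeholders in the style of validation_test.rb below):

  # test_set_validation no longer accepts :test_target_dataset_uri;
  # the endpoint values must be present in the test dataset itself
  post '/test_set_validation', {
    :model_uri          => "http://local-ot/model/1",
    :test_dataset_uri   => "http://local-ot/dataset/3",
    :prediction_feature => "http://local-ot/dataset/3/feature/endpoint"  # optional
  }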
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 8c8b11f..2967bd0 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -64,20 +64,20 @@ module Validation
# PENDING: model and referenced datasets are deleted as well, keep it that way?
def delete_validation( delete_all=true )
if (delete_all)
- to_delete = [:model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri, :prediction_dataset_uri ]
+ to_delete = [:model_uri, :training_dataset_uri, :test_dataset_uri, :prediction_dataset_uri ]
case self.validation_type
when "test_set_validation"
- to_delete -= [ :model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ]
+ to_delete -= [ :model_uri, :training_dataset_uri, :test_dataset_uri ]
when "bootstrapping"
- to_delete -= [ :test_target_dataset_uri ]
+ to_delete -= []
when "training_test_validation"
- to_delete -= [ :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ]
+ to_delete -= [ :training_dataset_uri, :test_dataset_uri ]
when "training_test_split"
- to_delete -= [ :test_target_dataset_uri ]
+ to_delete -= []
when "validate_datasets"
to_delete = []
when "crossvalidation"
- to_delete -= [ :test_target_dataset_uri ]
+ to_delete -= []
when "crossvalidation_statistics"
to_delete = []
else
@@ -189,12 +189,11 @@ module Validation
models = cv_vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)}
feature_type = models.first.feature_type(subjectid)
test_dataset_uris = cv_vals.collect{|v| v.test_dataset_uri}
- test_target_dataset_uris = cv_vals.collect{|v| v.test_target_dataset_uri}
prediction_feature = cv_vals.first.prediction_feature
prediction_dataset_uris = cv_vals.collect{|v| v.prediction_dataset_uri}
predicted_variables = models.collect{|m| m.predicted_variable(subjectid)}
predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)}
- p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature,
+ p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, prediction_feature,
prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, waiting_task )
self.prediction_data = p_data.data
p_data.data
@@ -225,7 +224,7 @@ module Validation
LOGGER.debug "computing prediction stats"
p_data = Lib::PredictionData.create( feature_type,
- self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature,
+ self.test_dataset_uri, self.prediction_feature,
self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid,
OpenTox::SubTask.create(task, 0, 80) )
self.prediction_data = p_data.data
@@ -418,7 +417,6 @@ module Validation
tmp_val << { :validation_type => "crossvalidation",
:training_dataset_uri => v.training_dataset_uri,
:test_dataset_uri => v.test_dataset_uri,
- :test_target_dataset_uri => self.dataset_uri,
:crossvalidation_id => self.id,
:crossvalidation_fold => v.crossvalidation_fold,
:prediction_feature => prediction_feature,
@@ -448,39 +446,38 @@ module Validation
case stratified
when "false"
if self.loo=="true"
- shuffled_compounds = orig_dataset.compounds
+ shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a
else
- shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed )
+ shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed )
end
- split_compounds = shuffled_compounds.chunk( self.num_folds.to_i )
- LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ")
+ split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i )
+ LOGGER.debug "cv: num instances for each fold: "+split_compound_indices.collect{|c| c.size}.join(", ")
self.num_folds.to_i.times do |n|
- test_compounds = []
- train_compounds = []
+ test_compound_indices = []
+ train_compound_indices = []
self.num_folds.to_i.times do |nn|
- compounds = split_compounds[nn]
+ compound_indices = split_compound_indices[nn]
if n == nn
- compounds.each{ |compound| test_compounds << compound}
+ compound_indices.each{ |compound| test_compound_indices << compound}
else
- compounds.each{ |compound| train_compounds << compound}
+ compound_indices.each{ |compound| train_compound_indices << compound}
end
end
raise "internal error, num test compounds not correct,"+
- " is '#{test_compounds.size}', should be '#{(shuffled_compounds.size/self.num_folds.to_i)}'" unless
- (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1
- raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+
- "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
+ " is '#{test_compound_indices.size}', should be '#{(shuffled_compound_indices.size/self.num_folds.to_i)}'" unless
+ (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1
+ raise "internal error, num train compounds not correct, should be '"+(shuffled_compound_indices.size-test_compound_indices.size).to_s+
+ "', is '"+train_compound_indices.size.to_s+"'" unless shuffled_compound_indices.size - test_compound_indices.size == train_compound_indices.size
datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s
meta[DC.title] = "training "+datasetname
- LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s
- train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys,
+ LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compound_indices.size.to_s
+ train_dataset_uri = orig_dataset.split( train_compound_indices, orig_dataset.features.keys,
meta, self.subjectid ).uri
train_dataset_uris << train_dataset_uri
meta[DC.title] = "test "+datasetname
- LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s
- test_features = orig_dataset.features.keys.dclone - [self.prediction_feature]
- test_dataset_uri = orig_dataset.split( test_compounds, test_features,
+ LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compound_indices.size.to_s
+ test_dataset_uri = orig_dataset.split( test_compound_indices, orig_dataset.features.keys,
meta, self.subjectid ).uri
test_dataset_uris << test_dataset_uri
end
@@ -505,7 +502,6 @@ module Validation
tmp_validation = { :validation_type => "crossvalidation",
:training_dataset_uri => train_dataset_uris[n],
:test_dataset_uri => test_dataset_uris[n],
- :test_target_dataset_uri => self.dataset_uri,
:crossvalidation_id => self.id, :crossvalidation_fold => (n+1),
:prediction_feature => self.prediction_feature,
:algorithm_uri => self.algorithm_uri,
@@ -537,25 +533,20 @@ module Validation
LOGGER.warn "no prediciton feature given, all features included in test dataset"
end
- compounds = orig_dataset.compounds
- raise OpenTox::NotFoundError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
-
- compounds.each do |c|
- raise OpenTox::NotFoundError.new "Bootstrapping not yet implemented for duplicate compounds" if
- orig_dataset.data_entries[c][prediction_feature].size > 1
- end
+ compound_indices = (0..(orig_dataset.compounds.size-1)).to_a
+ raise OpenTox::NotFoundError.new "Cannot split datset, num compounds in dataset < 2 ("+compound_indices.size.to_s+")" if compound_indices.size<2
srand random_seed.to_i
while true
- training_compounds = []
- compounds.size.times do
- training_compounds << compounds[rand(compounds.size)]
+ training_compound_indices = []
+ compound_indices.size.times do
+ training_compound_indices << compound_indices[rand(compound_indices.size)]
end
- test_compounds = []
- compounds.each do |c|
- test_compounds << c unless training_compounds.include?(c)
+ test_compound_indices = []
+ compound_indices.each do |idx|
+ test_compound_indices << idx unless training_compound_indices.include?(idx)
end
- if test_compounds.size > 0
+ if test_compound_indices.size > 0
break
else
srand rand(10000)
@@ -563,47 +554,26 @@ module Validation
end
LOGGER.debug "bootstrapping on dataset "+orig_dataset_uri+
- " into training ("+training_compounds.size.to_s+") and test ("+test_compounds.size.to_s+")"+
- ", duplicates in training dataset: "+test_compounds.size.to_s
+ " into training ("+training_compound_indices.size.to_s+") and test ("+test_compound_indices.size.to_s+")"+
+ ", duplicates in training dataset: "+test_compound_indices.size.to_s
task.progress(33) if task
result = {}
-# result[:training_dataset_uri] = orig_dataset.create_new_dataset( training_compounds,
-# orig_dataset.features,
-# "Bootstrapping training dataset of "+orig_dataset.title.to_s,
-# $sinatra.url_for('/bootstrapping',:full) )
- result[:training_dataset_uri] = orig_dataset.split( training_compounds,
+ result[:training_dataset_uri] = orig_dataset.split( training_compound_indices,
orig_dataset.features.keys,
{ DC.title => "Bootstrapping training dataset of "+orig_dataset.title.to_s,
DC.creator => $url_provider.url_for('/bootstrapping',:full) },
subjectid ).uri
task.progress(66) if task
-# result[:test_dataset_uri] = orig_dataset.create_new_dataset( test_compounds,
-# orig_dataset.features.dclone - [prediction_feature],
-# "Bootstrapping test dataset of "+orig_dataset.title.to_s,
-# $sinatra.url_for('/bootstrapping',:full) )
- result[:test_dataset_uri] = orig_dataset.split( test_compounds,
- orig_dataset.features.keys.dclone - [prediction_feature],
+ result[:test_dataset_uri] = orig_dataset.split( test_compound_indices,
+ orig_dataset.features.keys,
{ DC.title => "Bootstrapping test dataset of "+orig_dataset.title.to_s,
DC.creator => $url_provider.url_for('/bootstrapping',:full)} ,
subjectid ).uri
task.progress(100) if task
- if ENV['RACK_ENV'] =~ /test|debug/
- training_dataset = Lib::DatasetCache.find result[:training_dataset_uri],subjectid
- raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless training_dataset
- training_dataset.load_all
- value_count = 0
- training_dataset.compounds.each do |c|
- value_count += training_dataset.data_entries[c][prediction_feature].size
- end
- raise "training compounds error" unless value_count==training_compounds.size
- raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless
- Lib::DatasetCache.find result[:test_dataset_uri], subjectid
- end
LOGGER.debug "bootstrapping done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
-
return result
end
@@ -620,12 +590,17 @@ module Validation
orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid
orig_dataset.load_all subjectid
raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
+
if prediction_feature
- raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
- "' not found in dataset, features are: \n"+
- orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature)
- else
- LOGGER.warn "no prediciton feature given, all features will be included in test dataset"
+      if stratified=~/true/
+        raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
+          "' not found in dataset, features are: \n"+orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature)
+      else
+        LOGGER.warn "prediction_feature argument is ignored for non-stratified splits" if prediction_feature
+        prediction_feature=nil
+      end
+    elsif stratified=~/true/
+      raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature
end
meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) }
@@ -633,10 +608,8 @@ module Validation
case stratified
when /true|super/
if stratified=="true"
- raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature
features = [prediction_feature]
else
- LOGGER.warn "prediction feature is ignored for super-stratified splits" if prediction_feature
features = nil
end
r_util = OpenTox::RUtil.new
@@ -644,39 +617,36 @@ module Validation
r_util.quit_r
result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri}
when "false"
- compounds = orig_dataset.compounds
- raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
- split = (compounds.size*split_ratio).to_i
+ compound_indices = (0..(orig_dataset.compounds.size-1)).to_a
+ raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compound_indices.size.to_s+")" if compound_indices.size<2
+ split = (compound_indices.size*split_ratio).round
split = [split,1].max
- split = [split,compounds.size-2].min
+ split = [split,compound_indices.size-2].min
LOGGER.debug "splitting dataset "+orig_dataset_uri+
- " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+
+ " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compound_indices.size-1).to_s+
" (shuffled with seed "+random_seed.to_s+")"
- compounds.shuffle!( random_seed )
- training_compounds = compounds[0..split]
- test_compounds = compounds[(split+1)..-1]
+ compound_indices.shuffle!( random_seed )
+ training_compound_indices = compound_indices[0..(split-1)]
+ test_compound_indices = compound_indices[split..-1]
task.progress(33) if task
meta[DC.title] = "Training dataset split of "+orig_dataset.uri
result = {}
- result[:training_dataset_uri] = orig_dataset.split( training_compounds,
- orig_dataset.features.keys, meta, subjectid ).uri
+ train_data = orig_dataset.split( training_compound_indices,
+ orig_dataset.features.keys, meta, subjectid )
+ raise "Train dataset num coumpounds != "+(orig_dataset.compounds.size*split_ratio).round.to_s+", instead: "+train_data.compounds.size.to_s unless
+ train_data.compounds.size==(orig_dataset.compounds.size*split_ratio).round
+ result[:training_dataset_uri] = train_data.uri
task.progress(66) if task
meta[DC.title] = "Test dataset split of "+orig_dataset.uri
- result[:test_dataset_uri] = orig_dataset.split( test_compounds,
- orig_dataset.features.keys.dclone - [prediction_feature], meta, subjectid ).uri
+ test_data = orig_dataset.split( test_compound_indices,
+ orig_dataset.features.keys, meta, subjectid )
+ raise "Test dataset num coumpounds != "+(orig_dataset.compounds.size*(1-split_ratio)).round.to_s+", instead: "+test_data.compounds.size.to_s unless
+ test_data.compounds.size==(orig_dataset.compounds.size*(1-split_ratio)).round
+ result[:test_dataset_uri] = test_data.uri
task.progress(100) if task
- if ENV['RACK_ENV'] =~ /test|debug/
- raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless
- Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
- test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
- raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data
- test_data.load_compounds subjectid
- raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
- test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
- end
LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
else
raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}"
diff --git a/validation/validation_test.rb b/validation/validation_test.rb
index 70f3ca4..a6dd2a7 100755
--- a/validation/validation_test.rb
+++ b/validation/validation_test.rb
@@ -59,6 +59,22 @@ class ValidationTest < Test::Unit::TestCase
def test_it
begin
$test_case = self
+
+ post '/validate_datasets',{:test_dataset_uri=>"http://local-ot/dataset/14111",
+ :prediction_dataset_uri=>"http://local-ot/dataset/14113",
+ :prediction_feature=>"http://local-ot/dataset/14109/feature/Hamster%20Carcinogenicity",
+ :predicted_variable=>"http://local-ot/model/21/predicted/value",
+ :predicted_confidence=>"http://local-ot/model/21/predicted/confidence",
+ :classification=>"true"}
+
+#D, [2012-11-07T12:38:11.291069 #31035] DEBUG -- : validation :: loading prediction -- test-dataset: ["http://local-ot/dataset/14099"] :: /validation_service.rb:227:in `compute_prediction_data'
+# D, [2012-11-07T12:38:11.291174 #31035] DEBUG -- : validation :: loading prediction -- test-target-datset: ["http://local-ot/dataset/14097"] :: /validation_service.rb:227:in `compute_prediction_data'
+# D, [2012-11-07T12:38:11.291281 #31035] DEBUG -- : validation :: loading prediction -- prediction-dataset: ["http://local-ot/dataset/14101"] :: /validation_service.rb:227:in `compute_prediction_data'
+# D, [2012-11-07T12:38:11.291398 #31035] DEBUG -- : validation :: loading prediction -- predicted_variable: ["http://local-ot/model/19/predicted/value"] :: /validation_service.rb:227:in `compute_prediction_data'
+# D, [2012-11-07T12:38:11.291506 #31035] DEBUG -- : validation :: loading prediction -- predicted_confidence: ["http://local-ot/model/19/predicted/confidence"] :: /validation_service.rb:227:in `compute_prediction_data'
+# D, [2012-11-07T12:38:11.291611 #31035] DEBUG -- : validation :: loading prediction -- prediction_feature: http://local-ot/dataset/14097/feature/Hamster%20Carcinogenicity :: /validation_service.rb:227:in `compute_prediction_data'
+
+ exit
# dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=0"
# test_dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=1"
@@ -117,7 +133,6 @@ class ValidationTest < Test::Unit::TestCase
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/6907",
# :prediction_dataset_uri=>"http://local-ot/dataset/6909",
-# :test_target_dataset_uri=>"http://local-ot/dataset/6905",
# :prediction_feature=>"http://local-ot/dataset/6905/feature/Hamster%20Carcinogenicity",
# #:model_uri=>"http://local-ot/model/1078",
# :predicted_variable=>"http://local-ot/dataset/6909/feature/prediction/Hamster%20Carcinogenicity/value",
@@ -351,7 +366,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://apps.deaconsult.net:8080/ambit2/dataset/R3924",
# :prediction_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/R3924?feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F52%2Fpredicted",
- # #:test_target_dataset_uri=>"http://local-ot/dataset/202",
# :prediction_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21715",
# :predicted_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/28944",
# :regression=>"true"}
@@ -363,7 +377,6 @@ end
#get "/crossvalidation/19/predictions",nil,'HTTP_ACCEPT' => "application/x-yaml" #/statistics"
# post "",:model_uri=>"http://local-ot/model/1",:test_dataset_uri=>"http://local-ot/dataset/3",
- # :test_target_dataset_uri=>"http://local-ot/dataset/1"
# get "/crossvalidation/2",nil,'HTTP_ACCEPT' => "application/rdf+xml"
#puts last_response.body
@@ -384,7 +397,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/204",
# :prediction_dataset_uri=>"http://local-ot/dataset/206",
- # :test_target_dataset_uri=>"http://local-ot/dataset/202",
# :prediction_feature=>"http://ot-dev.in-silico.ch/toxcreate/feature#IRIS%20unit%20risk",
# :predicted_feature=>"http://ot-dev.in-silico.ch/toxcreate/feature#IRIS%20unit%20risk_lazar_regression",
# :regression=>"true"}
@@ -394,7 +406,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/9?max=10",
# :prediction_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/9?max=10",
-# #:test_target_dataset_uri=>"http://local-ot/dataset/202",
# :prediction_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21573",
# :predicted_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21573",
# #:regression=>"true"}
@@ -406,7 +417,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/89",
# :prediction_dataset_uri=>"http://local-ot/dataset/91",
- # :test_target_dataset_uri=>"http://local-ot/dataset/87",
# :prediction_feature=>"http://local-ot/dataset/1/feature/hamster_carcinogenicity",
# :predicted_feature=>"",
## :regression=>"true"}
@@ -419,7 +429,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/390",
# :prediction_dataset_uri=>"http://local-ot/dataset/392",
-# :test_target_dataset_uri=>"http://local-ot/dataset/388",
# :prediction_feature=>"http://local-ot/dataset/388/feature/repdose_classification",
# :model_uri=>"http://local-ot/model/31"}
# #:regression=>"true"}
@@ -432,7 +441,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/409",
# :prediction_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/410",
-# :test_target_dataset_uri=>"https://ambit.uni-plovdiv.bg:8443/ambit2/dataset/R401560",
# :prediction_feature=>"https://ambit.uni-plovdiv.bg:8443/ambit2/feature/22190",
# :predicted_feature=>"https://ambit.uni-plovdiv.bg:8443/ambit2/feature/218304",
# :regression=>"true",
@@ -453,7 +461,6 @@ end
# post "/validate_datasets",{
# :test_dataset_uri=>"http://local-ot/dataset/94",
# :prediction_dataset_uri=>'http://local-ot/dataset/96',
-# :test_target_dataset_uri=>'http://local-ot/dataset/92',
# :prediction_feature=>'http://local-ot/dataset/92/feature/Hamster%20Carcinogenicity',
# :predicted_feature=>"",
# :classification=>"true",