diff options
-rw-r--r-- | example.rb | 3 | ||||
-rw-r--r-- | lib/ot_predictions.rb | 21 | ||||
-rw-r--r-- | lib/validation_db.rb | 3 | ||||
-rw-r--r-- | report/util.rb | 2 | ||||
-rw-r--r-- | report/validation_access.rb | 2 | ||||
-rw-r--r-- | validation/validation_application.rb | 7 | ||||
-rw-r--r-- | validation/validation_service.rb | 28 | ||||
-rw-r--r-- | validation/validation_test.rb | 138 |
8 files changed, 123 insertions, 81 deletions
@@ -63,9 +63,10 @@ class Example #delete_all(@@config[:services]["opentox-model"]) OpenTox::RestClientWrapper.delete @@config[:services]["opentox-model"] - split_params = Validation::Util.train_test_dataset_split(data_uri, 0.9, 1) + split_params = Validation::Util.train_test_dataset_split(data_uri, URI.decode(@@feature), 0.9, 1) v = Validation::Validation.new :training_dataset_uri => split_params[:training_dataset_uri], :test_dataset_uri => split_params[:test_dataset_uri], + :test_class_dataset_uri => data_uri, :prediction_feature => URI.decode(@@feature), :algorithm_uri => @@alg v.validate_algorithm( @@alg_params ) diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb index 9e6f5c9..0176dcf 100644 --- a/lib/ot_predictions.rb +++ b/lib/ot_predictions.rb @@ -15,10 +15,11 @@ module Lib return @compounds[instance_index] end - def initialize(is_classification, test_dataset_uri, prediction_feature, prediction_dataset_uri, predicted_variable) + def initialize(is_classification, test_dataset_uri, test_target_datset_uri, prediction_feature, prediction_dataset_uri, predicted_variable) LOGGER.debug("loading prediciton via test-dateset:'"+test_dataset_uri.to_s+ - "' and prediction-dataset:'"+prediction_dataset_uri.to_s+ + "', test-target-datset:'"+test_target_datset_uri.to_s+ + "', prediction-dataset:'"+prediction_dataset_uri.to_s+ "', prediction_feature: '"+prediction_feature.to_s+"' "+ "', predicted_variable: '"+predicted_variable.to_s+"'") @@ -31,7 +32,19 @@ module Lib test_dataset = OpenTox::Dataset.find test_dataset_uri raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset - raise "test dataset feature not found: '"+prediction_feature.to_s+"', available features: "+test_dataset.features.inspect if test_dataset.features.index(prediction_feature)==nil + + if test_target_datset_uri == nil || test_target_datset_uri==test_dataset_uri + test_class_dataset = test_dataset + else + test_class_dataset = OpenTox::Dataset.find test_target_datset_uri + raise "test target datset not found: '"+test_target_datset_uri.to_s+"'" unless test_class_dataset + if CHECK_VALUES + test_dataset.compounds.each do |c| + raise "test compound not found on test class dataset "+c.to_s unless test_class_dataset.compounds.include?(c) + end + end + end + raise "test dataset feature not found: '"+prediction_feature.to_s+"', available features: "+test_class_dataset.features.inspect if test_class_dataset.features.index(prediction_feature)==nil @compounds = test_dataset.compounds LOGGER.debug "test dataset size: "+@compounds.size.to_s @@ -41,7 +54,7 @@ module Lib actual_values = [] @compounds.each do |c| - value = test_dataset.get_value(c, prediction_feature) + value = test_class_dataset.get_value(c, prediction_feature) if is_classification value = value.to_s unless value==nil diff --git a/lib/validation_db.rb b/lib/validation_db.rb index cb6708c..322d4cb 100644 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -8,7 +8,7 @@ require "lib/merge.rb" module Lib VAL_PROPS_GENERAL = [ :id, :uri, :model_uri, :algorithm_uri, :training_dataset_uri, :prediction_feature, - :test_dataset_uri, :prediction_dataset_uri, :created_at ] + :test_dataset_uri, :test_class_dataset_uri, :prediction_dataset_uri, :created_at ] VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ] VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ] VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG @@ -55,6 +55,7 @@ module Lib property :model_uri, String, :length => 255 property :algorithm_uri, String, :length => 255 property :training_dataset_uri, String, :length => 255 + property :test_class_dataset_uri, String, :length => 255 property :test_dataset_uri, String, :length => 255 property :prediction_dataset_uri, String, :length => 255 property :prediction_feature, String, :length => 255 diff --git a/report/util.rb b/report/util.rb index ca12963..06a3232 100644 --- a/report/util.rb +++ b/report/util.rb @@ -9,7 +9,7 @@ class Array end def remove_common_prefix() - if self.size > 2 + if self.size > 1 prefix = self.common_prefix if prefix.size>0 return self.collect{|word| word[prefix.size..-1]} diff --git a/report/validation_access.rb b/report/validation_access.rb index 7241218..426da99 100644 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -97,7 +97,7 @@ class Reports::ValidationDB < Reports::ValidationAccess end def get_predictions(validation) - Lib::OTPredictions.new( validation.classification?, validation.test_dataset_uri, + Lib::OTPredictions.new( validation.classification?, validation.test_dataset_uri, validation.test_class_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, validation.predicted_variable) end diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 01d74a3..d5647ce 100644 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -138,7 +138,7 @@ post '/?' do else halt 400, "illegal parameter combination for validation, use either\n"+ "* model_uri, test_dataset_uri\n"+ - "* algorithm_uri, training_dataset_uri, test_dataset_uri, prediction_feature\n" + "* algorithm_uri, training_dataset_uri, test_dataset_uri, prediction_feature\n"+ "params given: "+params.inspect end content_type "text/uri-list" @@ -153,9 +153,10 @@ post '/training_test_split' do halt 400, "algorithm_uri missing" unless params[:algorithm_uri] halt 400, "prediction_feature missing" unless params[:prediction_feature] - params.merge!(Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:split_ratio], params[:random_seed])) + params.merge!(Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed])) v = Validation::Validation.new :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], + :test_class_dataset_uri => params[:dataset_uri], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri] v.validate_algorithm( params[:algorithm_params]) @@ -169,7 +170,7 @@ post '/plain_training_test_split' do LOGGER.info "creating pure training test split "+params.inspect halt 400, "dataset_uri missing" unless params[:dataset_uri] - result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:split_ratio], params[:random_seed]) + result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed]) content_type "text/uri-list" result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 3cf1d56..7966d16 100644 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -132,7 +132,7 @@ module Validation update :algorithm_uri => model.algorithm unless @algorithm_uri LOGGER.debug "computing prediction stats" - prediction = Lib::OTPredictions.new( model.classification?, @test_dataset_uri, @prediction_feature, @prediction_dataset_uri, model.predicted_variables ) + prediction = Lib::OTPredictions.new( model.classification?, @test_dataset_uri, @test_class_dataset_uri, @prediction_feature, @prediction_dataset_uri, model.predicted_variables ) if prediction.classification? update :classification_statistics => prediction.compute_stats else @@ -265,6 +265,8 @@ module Validation end LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") + test_features = orig_dataset.features.dclone - [prediction_feature] + (1..@num_folds).each do |n| datasetname = 'cv'+@id.to_s + @@ -291,13 +293,14 @@ module Validation raise "internal error, num train compounds not correct" unless shuffled_compounds.size - test_compounds.size == train_compounds.size LOGGER.debug "training set: "+datasetname+"_train" - train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, datasetname + '_train', source ) + train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source ) LOGGER.debug "test set: "+datasetname+"_test" - test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, datasetname + '_test', source ) + test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source ) validation = Validation.new :training_dataset_uri => train_dataset_uri, :test_dataset_uri => test_dataset_uri, + :test_class_dataset_uri => @dataset_uri, :crossvalidation_id => @id, :crossvalidation_fold => n, :prediction_feature => prediction_feature, :algorithm_uri => @algorithm_uri @@ -310,15 +313,23 @@ module Validation # splits a dataset into test and training dataset # returns map with training_dataset_uri and test_dataset_uri - def self.train_test_dataset_split( orig_dataset_uri, split_ratio=nil, random_seed=nil ) + def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, split_ratio=nil, random_seed=nil ) split_ratio=0.67 unless split_ratio random_seed=1 unless random_seed orig_dataset = OpenTox::Dataset.find orig_dataset_uri $sinatra.halt 400, "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset - $sinatra.halt 400, "Split ratio invalid: "+split_ratio unless split_ratio and split_ratio=split_ratio.to_f - $sinatra.halt 400, "Split ratio not >0 and <1" unless split_ratio>0 && split_ratio<1 + $sinatra.halt 400, "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f + $sinatra.halt 400, "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1 + if prediction_feature + $sinatra.halt 404, "prediction_feature is already encoded: "+prediction_feature.to_s if prediction_feature=~/%20/ + prediction_feature = URI.encode(prediction_feature) + $sinatra.halt 400, "Prediction feature not found in dataset features: "+prediction_feature.to_s+ + ", features are: \n"+orig_dataset.features.inspect unless orig_dataset.features.include?(prediction_feature) + else + LOGGER.warn "no prediciton feature given, all features included in test dataset" + end compounds = orig_dataset.compounds $sinatra.halt 400, "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 @@ -338,12 +349,15 @@ module Validation {:training_dataset_uri => train_compounds, :test_dataset_uri => test_compounds}.each do |sym, compound_array| if sym == :training_dataset_uri + features = orig_dataset.features title = "Training dataset split of "+orig_dataset.title.to_s else + features = orig_dataset.features.dclone - [prediction_feature] title = "Test dataset split of "+orig_dataset.title.to_s end source = $sinatra.url_for('/training_test_split',:full) - result[sym] = orig_dataset.create_new_dataset( compound_array, title, source ) + + result[sym] = orig_dataset.create_new_dataset( compound_array, features, title, source ) end $sinatra.halt 400, "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless OpenTox::Dataset.find result[:training_dataset_uri] diff --git a/validation/validation_test.rb b/validation/validation_test.rb index 3a3d904..df3ffab 100644 --- a/validation/validation_test.rb +++ b/validation/validation_test.rb @@ -10,7 +10,7 @@ LOGGER = Logger.new(STDOUT) LOGGER.datetime_format = "%Y-%m-%d %H:%M:%S " class Example - attr_accessor :alg, :train_data, :test_data, :model, :pred_data, :act_feat, :pred_feat, :classification, :alg_params, :val, :orig_data + attr_accessor :alg, :train_data, :test_data, :model, :pred_data, :act_feat, :pred_feat, :classification, :alg_params, :val, :orig_data, :num_folds, :random_seed end class ValidationTest < Test::Unit::TestCase @@ -24,11 +24,13 @@ class ValidationTest < Test::Unit::TestCase #ex = ex_ntua #ex = ex_tum + #ex = ex_local #create_validation(ex) #validate_model(ex) #validate_algorithm(ex) #validate_split(ex) + #xval(ex) #test_dataset = OpenTox::Dataset.find ex_ntua2.pred_data #puts ex_ntua2.pred_data.to_s+", "+test_dataset.compounds.size.to_s+" compounds" @@ -52,25 +54,30 @@ class ValidationTest < Test::Unit::TestCase begin orig = OpenTox::Dataset.find(ex.orig_data) raise "not correct, upload" if (orig == nil || orig.compounds.size!=85) - rescue + rescue => e + puts e.message upload_uri = upload_data(dataset, File.new("data/hamster_carcinogenicity.yaml","r")) ex.orig_data = upload_uri end + ex.act_feat = "http://localhost/toxmodel/feature#Hamster Carcinogenicity (DSSTOX/CPDB)" + ex.train_data = File.join(dataset,"2") ex.test_data = File.join(dataset,"3") begin train = OpenTox::Dataset.find(ex.train_data) test = OpenTox::Dataset.find(ex.test_data) - raise "not correct, split" if (train == nil || test == nil || train.compounds.size>=85 || test.compounds.size>=test.compounds.size) - rescue - post '/plain_training_test_split', { :dataset_uri => ex.orig_data, :split_ratio=>0.75, :random_seed=>6} + raise "not correct, split "+train.to_s+" "+test.to_s+ + " "+train.compounds.size.to_s+" "+test.compounds.size.to_s if (train == nil || test == nil || train.compounds.size>=85 || test.compounds.size>=train.compounds.size) + rescue => e + puts e.message + post '/plain_training_test_split', { :dataset_uri => ex.orig_data, :prediction_feature=>ex.act_feat, :split_ratio=>0.75, :random_seed=>6} split = last_response.body.split("\n") ex.train_data = split[0] ex.test_data = split[1] end - ex.act_feat = "http://localhost/toxmodel/feature#Hamster Carcinogenicity (DSSTOX/CPDB)" + # example model #ex.model = "http://opentox.ntua.gr:3000/model/29" #ex.pred_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/261687" @@ -124,14 +131,14 @@ class ValidationTest < Test::Unit::TestCase #ex.alg = "http://opentox.informatik.tu-muenchen.de:8080/OpenTox-dev/algorithm/GaussP" #mini - #ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342" - #ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342" - #ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/03141" + ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342" + ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342" + ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/03141" #big - ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/639" - ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/640" - ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264185" + #ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/639" + #ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/640" + #ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264185" #ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264187" #test # example model @@ -183,57 +190,60 @@ class ValidationTest < Test::Unit::TestCase # end # end # -# def test_cv -# begin -# data_uri = upload_data(WS_DATA, FILE) -# -## first_validation=nil -## 2.times do -# -# num_folds = 9 -# post '/crossvalidation', { :dataset_uri => data_uri, :algorithm_uri => WS_CLASS_ALG, :prediction_feature => FEATURE_URI, -# :algorithm_params => "feature_generation_uri="+WS_FEATURE_ALG, :num_folds => num_folds, :random_seed => 2 } -# -# uri = last_response.body -# if OpenTox::Utils.task_uri?(uri) -# puts "task: "+uri.to_s -# uri = OpenTox::Task.find(uri).wait_for_resource.to_s + def xval(ex) + begin + #data_uri = upload_data(WS_DATA, FILE) + +# first_validation=nil +# 2.times do + + raise "no orig data" unless ex.orig_data + num_folds = ex.num_folds ? ex.num_folds : 3 + random_seed = ex.random_seed ? ex.random_seed : 1 + + post '/crossvalidation', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.act_feat, + :algorithm_params => ex.alg_params, :num_folds => num_folds, :random_seed => random_seed } + + uri = last_response.body + if OpenTox::Utils.task_uri?(uri) + puts "task: "+uri.to_s + uri = OpenTox::Task.find(uri).wait_for_resource.to_s + end + puts "crossvalidation: "+uri + + assert last_response.ok? + crossvalidation_id = uri.split("/")[-1] + add_resource("/crossvalidation/"+crossvalidation_id) + puts "id:"+crossvalidation_id + + get '/crossvalidation/'+crossvalidation_id + puts last_response.body + assert last_response.ok? || last_response.status==202 + + get '/crossvalidation/'+crossvalidation_id+'/validations' + puts "validations:\n"+last_response.body + assert last_response.ok? + assert last_response.body.split("\n").size == num_folds, "num-folds:"+num_folds.to_s+" but num lines is "+last_response.body.split("\n").size.to_s + +# if first_validation +# # assert that both cross validaitons use the same datasets +# first_validation2 = last_response.body.split("\n")[0].split("/")[-1] +# +# get '/'+first_validation+'/test_dataset_uri' +# assert last_response.ok? +# first_val_test_data = last_response.body +# +# get '/'+first_validation2+'/test_dataset_uri' +# assert last_response.ok? +# first_val2_test_data = last_response.body +# assert first_val_test_data==first_val2_test_data # end -# puts "crossvalidation: "+uri -# -# assert last_response.ok? -# crossvalidation_id = uri.split("/")[-1] -# add_resource("/crossvalidation/"+crossvalidation_id) -# puts "id:"+crossvalidation_id -# -# get '/crossvalidation/'+crossvalidation_id -# puts last_response.body -# assert last_response.ok? || last_response.status==202 -# -# get '/crossvalidation/'+crossvalidation_id+'/validations' -# puts "validations:\n"+last_response.body -# assert last_response.ok? -# assert last_response.body.split("\n").size == num_folds, "num-folds:"+num_folds.to_s+" but num lines is "+last_response.body.split("\n").size.to_s -# -## if first_validation -## # assert that both cross validaitons use the same datasets -## first_validation2 = last_response.body.split("\n")[0].split("/")[-1] -## -## get '/'+first_validation+'/test_dataset_uri' -## assert last_response.ok? -## first_val_test_data = last_response.body -## -## get '/'+first_validation2+'/test_dataset_uri' -## assert last_response.ok? -## first_val2_test_data = last_response.body -## assert first_val_test_data==first_val2_test_data -## end -## first_validation = last_response.body.split("\n")[0].split("/")[-1] -## end -# ensure -# #delete_resources -# end -# end +# first_validation = last_response.body.split("\n")[0].split("/")[-1] +# end + ensure + #delete_resources + end + end # def validate_model(ex) begin @@ -258,6 +268,8 @@ class ValidationTest < Test::Unit::TestCase # model_uri = "http://opentox.ntua.gr:3000/model/9" # data_uri_test = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342" + raise "model not defined" unless ex.model + post '', {:test_dataset_uri => ex.test_data, :model_uri => ex.model} #, :prediction_feature => FEATURE_URI} puts last_response.body @@ -384,7 +396,7 @@ class ValidationTest < Test::Unit::TestCase #data_uri=WS_DATA+"/"+DATA # post '/training_test_split', { :dataset_uri => data_uri, :algorithm_uri => WS_CLASS_ALG, :prediction_feature => FEATURE_URI, # :algorithm_params => "feature_generation_uri="+WS_FEATURE_ALG, :split_ratio=>0.75, :random_seed=>6} - post '/training_test_split', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.pred_feat, + post '/training_test_split', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.act_feat, :algorithm_params => ex.alg_params, :split_ratio=>0.75, :random_seed=>6} puts last_response.body |