summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  example.rb  3
-rw-r--r--  lib/ot_predictions.rb  21
-rw-r--r--  lib/validation_db.rb  3
-rw-r--r--  report/util.rb  2
-rw-r--r--  report/validation_access.rb  2
-rw-r--r--  validation/validation_application.rb  7
-rw-r--r--  validation/validation_service.rb  28
-rw-r--r--  validation/validation_test.rb  138
8 files changed, 123 insertions, 81 deletions
diff --git a/example.rb b/example.rb
index a9eb1e6..d5ee251 100644
--- a/example.rb
+++ b/example.rb
@@ -63,9 +63,10 @@ class Example
#delete_all(@@config[:services]["opentox-model"])
OpenTox::RestClientWrapper.delete @@config[:services]["opentox-model"]
- split_params = Validation::Util.train_test_dataset_split(data_uri, 0.9, 1)
+ split_params = Validation::Util.train_test_dataset_split(data_uri, URI.decode(@@feature), 0.9, 1)
v = Validation::Validation.new :training_dataset_uri => split_params[:training_dataset_uri],
:test_dataset_uri => split_params[:test_dataset_uri],
+ :test_class_dataset_uri => data_uri,
:prediction_feature => URI.decode(@@feature),
:algorithm_uri => @@alg
v.validate_algorithm( @@alg_params )
diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb
index 9e6f5c9..0176dcf 100644
--- a/lib/ot_predictions.rb
+++ b/lib/ot_predictions.rb
@@ -15,10 +15,11 @@ module Lib
return @compounds[instance_index]
end
- def initialize(is_classification, test_dataset_uri, prediction_feature, prediction_dataset_uri, predicted_variable)
+ def initialize(is_classification, test_dataset_uri, test_target_datset_uri, prediction_feature, prediction_dataset_uri, predicted_variable)
LOGGER.debug("loading prediciton via test-dateset:'"+test_dataset_uri.to_s+
- "' and prediction-dataset:'"+prediction_dataset_uri.to_s+
+ "', test-target-datset:'"+test_target_datset_uri.to_s+
+ "', prediction-dataset:'"+prediction_dataset_uri.to_s+
"', prediction_feature: '"+prediction_feature.to_s+"' "+
"', predicted_variable: '"+predicted_variable.to_s+"'")
@@ -31,7 +32,19 @@ module Lib
test_dataset = OpenTox::Dataset.find test_dataset_uri
raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset
- raise "test dataset feature not found: '"+prediction_feature.to_s+"', available features: "+test_dataset.features.inspect if test_dataset.features.index(prediction_feature)==nil
+
+ if test_target_datset_uri == nil || test_target_datset_uri==test_dataset_uri
+ test_class_dataset = test_dataset
+ else
+ test_class_dataset = OpenTox::Dataset.find test_target_datset_uri
+ raise "test target datset not found: '"+test_target_datset_uri.to_s+"'" unless test_class_dataset
+ if CHECK_VALUES
+ test_dataset.compounds.each do |c|
+ raise "test compound not found on test class dataset "+c.to_s unless test_class_dataset.compounds.include?(c)
+ end
+ end
+ end
+ raise "test dataset feature not found: '"+prediction_feature.to_s+"', available features: "+test_class_dataset.features.inspect if test_class_dataset.features.index(prediction_feature)==nil
@compounds = test_dataset.compounds
LOGGER.debug "test dataset size: "+@compounds.size.to_s
@@ -41,7 +54,7 @@ module Lib
actual_values = []
@compounds.each do |c|
- value = test_dataset.get_value(c, prediction_feature)
+ value = test_class_dataset.get_value(c, prediction_feature)
if is_classification
value = value.to_s unless value==nil
diff --git a/lib/validation_db.rb b/lib/validation_db.rb
index cb6708c..322d4cb 100644
--- a/lib/validation_db.rb
+++ b/lib/validation_db.rb
@@ -8,7 +8,7 @@ require "lib/merge.rb"
module Lib
VAL_PROPS_GENERAL = [ :id, :uri, :model_uri, :algorithm_uri, :training_dataset_uri, :prediction_feature,
- :test_dataset_uri, :prediction_dataset_uri, :created_at ]
+ :test_dataset_uri, :test_class_dataset_uri, :prediction_dataset_uri, :created_at ]
VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ]
VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ]
VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG
@@ -55,6 +55,7 @@ module Lib
property :model_uri, String, :length => 255
property :algorithm_uri, String, :length => 255
property :training_dataset_uri, String, :length => 255
+ property :test_class_dataset_uri, String, :length => 255
property :test_dataset_uri, String, :length => 255
property :prediction_dataset_uri, String, :length => 255
property :prediction_feature, String, :length => 255
diff --git a/report/util.rb b/report/util.rb
index ca12963..06a3232 100644
--- a/report/util.rb
+++ b/report/util.rb
@@ -9,7 +9,7 @@ class Array
end
def remove_common_prefix()
- if self.size > 2
+ if self.size > 1
prefix = self.common_prefix
if prefix.size>0
return self.collect{|word| word[prefix.size..-1]}
diff --git a/report/validation_access.rb b/report/validation_access.rb
index 7241218..426da99 100644
--- a/report/validation_access.rb
+++ b/report/validation_access.rb
@@ -97,7 +97,7 @@ class Reports::ValidationDB < Reports::ValidationAccess
end
def get_predictions(validation)
- Lib::OTPredictions.new( validation.classification?, validation.test_dataset_uri,
+ Lib::OTPredictions.new( validation.classification?, validation.test_dataset_uri, validation.test_class_dataset_uri,
validation.prediction_feature, validation.prediction_dataset_uri, validation.predicted_variable)
end
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index 01d74a3..d5647ce 100644
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -138,7 +138,7 @@ post '/?' do
else
halt 400, "illegal parameter combination for validation, use either\n"+
"* model_uri, test_dataset_uri\n"+
- "* algorithm_uri, training_dataset_uri, test_dataset_uri, prediction_feature\n"
+ "* algorithm_uri, training_dataset_uri, test_dataset_uri, prediction_feature\n"+
"params given: "+params.inspect
end
content_type "text/uri-list"
@@ -153,9 +153,10 @@ post '/training_test_split' do
halt 400, "algorithm_uri missing" unless params[:algorithm_uri]
halt 400, "prediction_feature missing" unless params[:prediction_feature]
- params.merge!(Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:split_ratio], params[:random_seed]))
+ params.merge!(Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed]))
v = Validation::Validation.new :training_dataset_uri => params[:training_dataset_uri],
:test_dataset_uri => params[:test_dataset_uri],
+ :test_class_dataset_uri => params[:dataset_uri],
:prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri]
v.validate_algorithm( params[:algorithm_params])
@@ -169,7 +170,7 @@ post '/plain_training_test_split' do
LOGGER.info "creating pure training test split "+params.inspect
halt 400, "dataset_uri missing" unless params[:dataset_uri]
- result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:split_ratio], params[:random_seed])
+ result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed])
content_type "text/uri-list"
result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n"
end
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 3cf1d56..7966d16 100644
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -132,7 +132,7 @@ module Validation
update :algorithm_uri => model.algorithm unless @algorithm_uri
LOGGER.debug "computing prediction stats"
- prediction = Lib::OTPredictions.new( model.classification?, @test_dataset_uri, @prediction_feature, @prediction_dataset_uri, model.predicted_variables )
+ prediction = Lib::OTPredictions.new( model.classification?, @test_dataset_uri, @test_class_dataset_uri, @prediction_feature, @prediction_dataset_uri, model.predicted_variables )
if prediction.classification?
update :classification_statistics => prediction.compute_stats
else
@@ -265,6 +265,8 @@ module Validation
end
LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ")
+ test_features = orig_dataset.features.dclone - [prediction_feature]
+
(1..@num_folds).each do |n|
datasetname = 'cv'+@id.to_s +
@@ -291,13 +293,14 @@ module Validation
raise "internal error, num train compounds not correct" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
LOGGER.debug "training set: "+datasetname+"_train"
- train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, datasetname + '_train', source )
+ train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source )
LOGGER.debug "test set: "+datasetname+"_test"
- test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, datasetname + '_test', source )
+ test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source )
validation = Validation.new :training_dataset_uri => train_dataset_uri,
:test_dataset_uri => test_dataset_uri,
+ :test_class_dataset_uri => @dataset_uri,
:crossvalidation_id => @id, :crossvalidation_fold => n,
:prediction_feature => prediction_feature,
:algorithm_uri => @algorithm_uri
@@ -310,15 +313,23 @@ module Validation
# splits a dataset into test and training dataset
# returns map with training_dataset_uri and test_dataset_uri
- def self.train_test_dataset_split( orig_dataset_uri, split_ratio=nil, random_seed=nil )
+ def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, split_ratio=nil, random_seed=nil )
split_ratio=0.67 unless split_ratio
random_seed=1 unless random_seed
orig_dataset = OpenTox::Dataset.find orig_dataset_uri
$sinatra.halt 400, "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
- $sinatra.halt 400, "Split ratio invalid: "+split_ratio unless split_ratio and split_ratio=split_ratio.to_f
- $sinatra.halt 400, "Split ratio not >0 and <1" unless split_ratio>0 && split_ratio<1
+ $sinatra.halt 400, "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
+ $sinatra.halt 400, "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
+ if prediction_feature
+ $sinatra.halt 404, "prediction_feature is already encoded: "+prediction_feature.to_s if prediction_feature=~/%20/
+ prediction_feature = URI.encode(prediction_feature)
+ $sinatra.halt 400, "Prediction feature not found in dataset features: "+prediction_feature.to_s+
+ ", features are: \n"+orig_dataset.features.inspect unless orig_dataset.features.include?(prediction_feature)
+ else
+ LOGGER.warn "no prediciton feature given, all features included in test dataset"
+ end
compounds = orig_dataset.compounds
$sinatra.halt 400, "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
@@ -338,12 +349,15 @@ module Validation
{:training_dataset_uri => train_compounds, :test_dataset_uri => test_compounds}.each do |sym, compound_array|
if sym == :training_dataset_uri
+ features = orig_dataset.features
title = "Training dataset split of "+orig_dataset.title.to_s
else
+ features = orig_dataset.features.dclone - [prediction_feature]
title = "Test dataset split of "+orig_dataset.title.to_s
end
source = $sinatra.url_for('/training_test_split',:full)
- result[sym] = orig_dataset.create_new_dataset( compound_array, title, source )
+
+ result[sym] = orig_dataset.create_new_dataset( compound_array, features, title, source )
end
$sinatra.halt 400, "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless OpenTox::Dataset.find result[:training_dataset_uri]
diff --git a/validation/validation_test.rb b/validation/validation_test.rb
index 3a3d904..df3ffab 100644
--- a/validation/validation_test.rb
+++ b/validation/validation_test.rb
@@ -10,7 +10,7 @@ LOGGER = Logger.new(STDOUT)
LOGGER.datetime_format = "%Y-%m-%d %H:%M:%S "
class Example
- attr_accessor :alg, :train_data, :test_data, :model, :pred_data, :act_feat, :pred_feat, :classification, :alg_params, :val, :orig_data
+ attr_accessor :alg, :train_data, :test_data, :model, :pred_data, :act_feat, :pred_feat, :classification, :alg_params, :val, :orig_data, :num_folds, :random_seed
end
class ValidationTest < Test::Unit::TestCase
@@ -24,11 +24,13 @@ class ValidationTest < Test::Unit::TestCase
#ex = ex_ntua
#ex = ex_tum
+ #ex = ex_local
#create_validation(ex)
#validate_model(ex)
#validate_algorithm(ex)
#validate_split(ex)
+ #xval(ex)
#test_dataset = OpenTox::Dataset.find ex_ntua2.pred_data
#puts ex_ntua2.pred_data.to_s+", "+test_dataset.compounds.size.to_s+" compounds"
@@ -52,25 +54,30 @@ class ValidationTest < Test::Unit::TestCase
begin
orig = OpenTox::Dataset.find(ex.orig_data)
raise "not correct, upload" if (orig == nil || orig.compounds.size!=85)
- rescue
+ rescue => e
+ puts e.message
upload_uri = upload_data(dataset, File.new("data/hamster_carcinogenicity.yaml","r"))
ex.orig_data = upload_uri
end
+ ex.act_feat = "http://localhost/toxmodel/feature#Hamster Carcinogenicity (DSSTOX/CPDB)"
+
ex.train_data = File.join(dataset,"2")
ex.test_data = File.join(dataset,"3")
begin
train = OpenTox::Dataset.find(ex.train_data)
test = OpenTox::Dataset.find(ex.test_data)
- raise "not correct, split" if (train == nil || test == nil || train.compounds.size>=85 || test.compounds.size>=test.compounds.size)
- rescue
- post '/plain_training_test_split', { :dataset_uri => ex.orig_data, :split_ratio=>0.75, :random_seed=>6}
+ raise "not correct, split "+train.to_s+" "+test.to_s+
+ " "+train.compounds.size.to_s+" "+test.compounds.size.to_s if (train == nil || test == nil || train.compounds.size>=85 || test.compounds.size>=train.compounds.size)
+ rescue => e
+ puts e.message
+ post '/plain_training_test_split', { :dataset_uri => ex.orig_data, :prediction_feature=>ex.act_feat, :split_ratio=>0.75, :random_seed=>6}
split = last_response.body.split("\n")
ex.train_data = split[0]
ex.test_data = split[1]
end
- ex.act_feat = "http://localhost/toxmodel/feature#Hamster Carcinogenicity (DSSTOX/CPDB)"
+
# example model
#ex.model = "http://opentox.ntua.gr:3000/model/29"
#ex.pred_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/261687"
@@ -124,14 +131,14 @@ class ValidationTest < Test::Unit::TestCase
#ex.alg = "http://opentox.informatik.tu-muenchen.de:8080/OpenTox-dev/algorithm/GaussP"
#mini
- #ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342"
- #ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342"
- #ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/03141"
+ ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342"
+ ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342"
+ ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/03141"
#big
- ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/639"
- ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/640"
- ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264185"
+ #ex.train_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/639"
+ #ex.test_data = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/640"
+ #ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264185"
#ex.act_feat = "http://ambit.uni-plovdiv.bg:8080/ambit2/feature/264187" #test
# example model
@@ -183,57 +190,60 @@ class ValidationTest < Test::Unit::TestCase
# end
# end
#
-# def test_cv
-# begin
-# data_uri = upload_data(WS_DATA, FILE)
-#
-## first_validation=nil
-## 2.times do
-#
-# num_folds = 9
-# post '/crossvalidation', { :dataset_uri => data_uri, :algorithm_uri => WS_CLASS_ALG, :prediction_feature => FEATURE_URI,
-# :algorithm_params => "feature_generation_uri="+WS_FEATURE_ALG, :num_folds => num_folds, :random_seed => 2 }
-#
-# uri = last_response.body
-# if OpenTox::Utils.task_uri?(uri)
-# puts "task: "+uri.to_s
-# uri = OpenTox::Task.find(uri).wait_for_resource.to_s
+ def xval(ex)
+ begin
+ #data_uri = upload_data(WS_DATA, FILE)
+
+# first_validation=nil
+# 2.times do
+
+ raise "no orig data" unless ex.orig_data
+ num_folds = ex.num_folds ? ex.num_folds : 3
+ random_seed = ex.random_seed ? ex.random_seed : 1
+
+ post '/crossvalidation', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.act_feat,
+ :algorithm_params => ex.alg_params, :num_folds => num_folds, :random_seed => random_seed }
+
+ uri = last_response.body
+ if OpenTox::Utils.task_uri?(uri)
+ puts "task: "+uri.to_s
+ uri = OpenTox::Task.find(uri).wait_for_resource.to_s
+ end
+ puts "crossvalidation: "+uri
+
+ assert last_response.ok?
+ crossvalidation_id = uri.split("/")[-1]
+ add_resource("/crossvalidation/"+crossvalidation_id)
+ puts "id:"+crossvalidation_id
+
+ get '/crossvalidation/'+crossvalidation_id
+ puts last_response.body
+ assert last_response.ok? || last_response.status==202
+
+ get '/crossvalidation/'+crossvalidation_id+'/validations'
+ puts "validations:\n"+last_response.body
+ assert last_response.ok?
+ assert last_response.body.split("\n").size == num_folds, "num-folds:"+num_folds.to_s+" but num lines is "+last_response.body.split("\n").size.to_s
+
+# if first_validation
+# # assert that both cross validaitons use the same datasets
+# first_validation2 = last_response.body.split("\n")[0].split("/")[-1]
+#
+# get '/'+first_validation+'/test_dataset_uri'
+# assert last_response.ok?
+# first_val_test_data = last_response.body
+#
+# get '/'+first_validation2+'/test_dataset_uri'
+# assert last_response.ok?
+# first_val2_test_data = last_response.body
+# assert first_val_test_data==first_val2_test_data
# end
-# puts "crossvalidation: "+uri
-#
-# assert last_response.ok?
-# crossvalidation_id = uri.split("/")[-1]
-# add_resource("/crossvalidation/"+crossvalidation_id)
-# puts "id:"+crossvalidation_id
-#
-# get '/crossvalidation/'+crossvalidation_id
-# puts last_response.body
-# assert last_response.ok? || last_response.status==202
-#
-# get '/crossvalidation/'+crossvalidation_id+'/validations'
-# puts "validations:\n"+last_response.body
-# assert last_response.ok?
-# assert last_response.body.split("\n").size == num_folds, "num-folds:"+num_folds.to_s+" but num lines is "+last_response.body.split("\n").size.to_s
-#
-## if first_validation
-## # assert that both cross validaitons use the same datasets
-## first_validation2 = last_response.body.split("\n")[0].split("/")[-1]
-##
-## get '/'+first_validation+'/test_dataset_uri'
-## assert last_response.ok?
-## first_val_test_data = last_response.body
-##
-## get '/'+first_validation2+'/test_dataset_uri'
-## assert last_response.ok?
-## first_val2_test_data = last_response.body
-## assert first_val_test_data==first_val2_test_data
-## end
-## first_validation = last_response.body.split("\n")[0].split("/")[-1]
-## end
-# ensure
-# #delete_resources
-# end
-# end
+# first_validation = last_response.body.split("\n")[0].split("/")[-1]
+# end
+ ensure
+ #delete_resources
+ end
+ end
#
def validate_model(ex)
begin
@@ -258,6 +268,8 @@ class ValidationTest < Test::Unit::TestCase
# model_uri = "http://opentox.ntua.gr:3000/model/9"
# data_uri_test = "http://ambit.uni-plovdiv.bg:8080/ambit2/dataset/342"
+ raise "model not defined" unless ex.model
+
post '', {:test_dataset_uri => ex.test_data, :model_uri => ex.model} #, :prediction_feature => FEATURE_URI}
puts last_response.body
@@ -384,7 +396,7 @@ class ValidationTest < Test::Unit::TestCase
#data_uri=WS_DATA+"/"+DATA
# post '/training_test_split', { :dataset_uri => data_uri, :algorithm_uri => WS_CLASS_ALG, :prediction_feature => FEATURE_URI,
# :algorithm_params => "feature_generation_uri="+WS_FEATURE_ALG, :split_ratio=>0.75, :random_seed=>6}
- post '/training_test_split', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.pred_feat,
+ post '/training_test_split', { :dataset_uri => ex.orig_data, :algorithm_uri => ex.alg, :prediction_feature => ex.act_feat,
:algorithm_params => ex.alg_params, :split_ratio=>0.75, :random_seed=>6}
puts last_response.body