From d1f3a0ffb95e0b3b3bd6fa9ff0de7f4f0779256f Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 26 Jun 2012 13:04:05 +0200 Subject: fix median for medianConfidence, add confidence to validation visualization, adjust to new super strat methods --- lib/predictions.rb | 12 ++---------- validation/validation_application.rb | 16 +++++++++++++--- validation/validation_service.rb | 19 ++++++++++--------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/lib/predictions.rb b/lib/predictions.rb index eb13b31..63578fd 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -1,14 +1,6 @@ require "lib/prediction_data.rb" - - -class Array - - def median #array has to be sorted! - (self[size/2] + self[(size+1)/2]) / 2.0 - end - -end +require "statsample" module Lib @@ -668,7 +660,7 @@ module Lib # data for (roc-)plots ################################################################################### def median_confidence - @confidence_values.median if confidence_values_available? + @confidence_values.to_scale.median if confidence_values_available? end def get_roc_prediction_values(class_value) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index f9a5b1c..689241c 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -9,8 +9,8 @@ require 'validation/validation_service.rb' helpers do def check_stratified(params) params[:stratified] = "false" unless params[:stratified] - raise OpenTox::BadRequestError.new "stratified != true|false|super|anti, is #{params[:stratified]}" unless - params[:stratified]=~/true|false|super|anti/ + raise OpenTox::BadRequestError.new "stratified != true|false|super|super4|anti, is #{params[:stratified]}" unless + params[:stratified]=~/^(true|false|super|super4|anti)$/ end end @@ -583,8 +583,10 @@ post '/plain_training_test_split' do raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri] check_stratified(params) task = OpenTox::Task.create( "Create data-split", url_for("/plain_training_test_split", :full) ) do |task| + split_features = nil + split_features = params[:split_features].split(";") if params[:split_features] result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid, - params[:stratified], params[:split_ratio], params[:random_seed], params[:missing_values], task) + params[:stratified], params[:split_ratio], params[:random_seed], params[:missing_values], task, split_features) content_type "text/uri-list" res = result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" LOGGER.info "plain training test split done #{res.to_s.gsub("\n"," \\n ")}" @@ -651,6 +653,7 @@ get '/:id/viz' do m = OpenTox::Model::Generic.find(validation.model_uri) predicted_feature = m.predicted_variable(nil) + confidence_feature = m.predicted_confidence(nil) actual_feature = validation.prediction_feature d = OpenTox::Dataset.create @@ -665,6 +668,8 @@ get '/:id/viz' do d.add_feature(correct_classified_feature) predicted_nice_feature = "http://predicted" d.add_feature(predicted_nice_feature) + confidence_nice_feature = "http://confidence" + d.add_feature(confidence_nice_feature) [training, test].each do |data| data.compounds.each do |c| @@ -689,6 +694,11 @@ get '/:id/viz' do d.add(c,predicted_nice_feature,p[0],true) d.add(c,correct_classified_feature,p[0]==a[0] ? "correct" : "miss",true) end + if prediction.data_entries[c][confidence_feature] + conf = prediction.data_entries[c][confidence_feature] + raise if conf.size!=1 + d.add(c,confidence_nice_feature,conf[0],true) + end end d.to_csv end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 5fded24..1b29630 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -500,8 +500,8 @@ module Validation meta, self.subjectid ).uri test_dataset_uris << test_dataset_uri end - when /true|super/ - raise "DO NOT USED SUPER-STRATIFICATION FOR VAL-EXPERIMENTS AND CV, IF SO SOLVE _MISSING_VAULE_NA_ PROBLEM" if stratified=="super" + when /^(true|super|super4)$/ + raise "DO NOT USED SUPER-STRATIFICATION FOR VAL-EXPERIMENTS AND CV, IF SO SOLVE _MISSING_VAULE_NA_ PROBLEM" if stratified=~/super/ if stratified=="true" features = [ self.prediction_feature ] else @@ -627,7 +627,7 @@ module Validation # splits a dataset into test and training dataset # returns map with training_dataset_uri and test_dataset_uri def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", - split_ratio=nil, random_seed=nil, missing_values=nil, task=nil ) + split_ratio=nil, random_seed=nil, missing_values=nil, task=nil, features=nil ) split_ratio=0.67 unless split_ratio split_ratio = split_ratio.to_f @@ -651,30 +651,31 @@ module Validation meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) } case stratified - when /true|super|anti/ + when /^(true|super|super4|anti)$/ if stratified=="true" raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature + LOGGER.warn "split features are ignored for stratified splits (use super instead)" if features features = [prediction_feature] else LOGGER.warn "prediction feature is ignored for super- or anti-stratified splits" if prediction_feature - features = nil end r_util = OpenTox::RUtil.new - train, test = r_util.stratified_split( orig_dataset, meta, missing_values, split_ratio, @subjectid, random_seed, features, stratified=="anti" ) + train, test = r_util.stratified_split( orig_dataset, meta, missing_values, split_ratio, @subjectid, random_seed, features, stratified ) r_util.quit_r result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri} when "false" + LOGGER.warn "split features are ignored for non-stratified splits (use super instead)" if features compounds = orig_dataset.compounds raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 split = (compounds.size*split_ratio).to_i split = [split,1].max split = [split,compounds.size-2].min LOGGER.debug "splitting dataset "+orig_dataset_uri+ - " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+ + " into train:0-"+(split-1).to_s+" and test:"+split.to_s+"-"+(compounds.size-1).to_s+ " (shuffled with seed "+random_seed.to_s+")" compounds.shuffle!( random_seed ) - training_compounds = compounds[0..split] - test_compounds = compounds[(split+1)..-1] + training_compounds = compounds[0..(split-1)] + test_compounds = compounds[split..-1] task.progress(33) if task meta[DC.title] = "Training dataset split of "+orig_dataset.uri -- cgit v1.2.3