From ba9d3e628e424a84d8c892c4fdf49c2258a95352 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Fri, 12 Oct 2012 10:27:15 +0200 Subject: add contra splitting, provide access to dataset-splits, minor stuff --- lib/predictions.rb | 135 ++++++++++++++++++++--------------- report/plot_factory.rb | 2 +- report/report_service.rb | 2 +- report/validation_access.rb | 3 + validation/validation_application.rb | 45 +++++++++++- validation/validation_service.rb | 10 ++- validation/validation_test.rb | 32 +++++---- 7 files changed, 154 insertions(+), 75 deletions(-) diff --git a/lib/predictions.rb b/lib/predictions.rb index 63578fd..4708630 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -1,4 +1,6 @@ +require "rubygems" + require "lib/prediction_data.rb" require "statsample" @@ -813,66 +815,87 @@ module Lib require "rubygems" require "opentox-ruby" - max_deviation = rand * 0.9 - avg_deviation = max_deviation * 0.5 - - p = [] - a = [] - c = [] - (100 + rand(1000)).times do |i| - r = rand - deviation = rand * max_deviation - a << r - p << r + ((rand<0.5 ? -1 : 1) * deviation) - #c << 0.5 - if (deviation > avg_deviation) - c << 0.4 - else - c << 0.6 - end - #puts a[-1].to_s+" "+p[-1].to_s - end - puts "num values "+p.size.to_s + max_deviation = rand * 0.5 - #a = [1.0,2.0, 3.0,4.0, 5.0] - #p = [1.5,2.25,3.0,3.75,4.5] + weired_index = 0 - #a = [1.0,2.0,3.0,4.0,5.0] - #p = [1.5,2.5,3.5,4.5,5.5] - - #p = a.collect{|v| v-0.5} - #p = a.collect{|v| v+0.5} + 10.times do - #p = [2.0,2.5,3.0,3.5,4.0] - - c = Array.new(p.size,nil) + p = [] + a = [] + 500.times do |i| + r = rand + deviation = rand * max_deviation + a << r + p << r + ((rand<0.5 ? -1 : 1) * deviation) + end + puts "num values "+p.size.to_s + c = Array.new(p.size,nil) + data = { :predicted_values => p, :actual_values => a, :confidence_values => c, + :feature_type => "regression", :accept_values => nil } + pred = Predictions.new(data) + puts "ccc "+"%.3f" % pred.concordance_correlation_coefficient.to_s + puts "rmse "+"%.3f" % pred.root_mean_squared_error.to_s + ccc_1 = pred.concordance_correlation_coefficient + rmse_1 = pred.root_mean_squared_error + + p = [] + a = [] + 2500.times do |i| + r = rand + deviation = rand * max_deviation + a << r + p << r + ((rand<0.5 ? -1 : 1) * deviation) + end + puts "num values "+p.size.to_s + c = Array.new(p.size,nil) + data = { :predicted_values => p, :actual_values => a, :confidence_values => c, + :feature_type => "regression", :accept_values => nil } + pred = Predictions.new(data) + puts "ccc "+"%.3f" % pred.concordance_correlation_coefficient.to_s + puts "rmse "+"%.3f" % pred.root_mean_squared_error.to_s + ccc_2 = pred.concordance_correlation_coefficient + rmse_2 = pred.root_mean_squared_error + + ccc_d = ccc_1 - ccc_2 + rmse_d = rmse_1 - rmse_2 + + dev = 0.0005 + + puts "ccc-d "+"%.3f" % ccc_d + puts "rmse-d "+"%.3f" % rmse_d + + if ccc_d.abs>dev or rmse_d.abs>dev + + if ((ccc_d>0 and rmse_d<0) or (ccc_d<0 and rmse_d>0)) #ccc_d.abs>dev and rmse_d.abs>dev and + puts "weired" + weired_index += 1 + else + puts "not weired" + weired_index -= 1 + end + puts weired_index + else + #puts "only small" + end + + end - data = { :predicted_values => p, :actual_values => a, :confidence_values => c, - :feature_type => "regression", :accept_values => nil } - - pred = Predictions.new(data) - puts "internal" - #puts "r-square old "+pred.r_square_old.to_s - puts "cor "+pred.sample_correlation_coefficient.to_s - #puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s - puts "r-square "+pred.r_square.to_s - puts "ccc "+pred.concordance_correlation_coefficient.to_s - - puts "R" - rutil = OpenTox::RUtil.new - - rutil.r.assign "v1",a - rutil.r.assign "v2",p - puts "r cor "+rutil.r.pull("cor(v1,v2)").to_s - rutil.r.eval "fit <- lm(v1 ~ v2)" - rutil.r.eval "sum <- summary(fit)" - puts "r r-square "+rutil.r.pull("sum$r.squared").to_s - puts "r adjusted-r-square "+rutil.r.pull("sum$adj.r.squared").to_s - rutil.r.eval "save.image(\"/tmp/image.R\")" - #rutil.r.eval "require(epiR)" - #rutil.r.eval "tmp.ccc <- epi.ccc(v1,v2)" - #puts "r ccc "+rutil.r.pull("tmp.ccc$rho.c$est").to_s - rutil.quit_r +# puts "R" +# rutil = OpenTox::RUtil.new +# +# rutil.r.assign "v1",a +# rutil.r.assign "v2",p +# puts "r cor "+rutil.r.pull("cor(v1,v2)").to_s +# rutil.r.eval "fit <- lm(v1 ~ v2)" +# rutil.r.eval "sum <- summary(fit)" +# puts "r r-square "+rutil.r.pull("sum$r.squared").to_s +# puts "r adjusted-r-square "+rutil.r.pull("sum$adj.r.squared").to_s +# rutil.r.eval "save.image(\"/tmp/image.R\")" +# #rutil.r.eval "require(epiR)" +# #rutil.r.eval "tmp.ccc <- epi.ccc(v1,v2)" +# #puts "r ccc "+rutil.r.pull("tmp.ccc$rho.c$est").to_s +# rutil.quit_r end def prediction_feature_value_map(proc) diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 61c3eea..a2be092 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -125,7 +125,7 @@ module Reports else Reports::r_util.feature_value_plot(out_files, validation_set.validations[0].training_feature_dataset_uri, validation_set.validations[0].test_feature_dataset_uri, "Training Data", "Test Data", - nil, validation_set.validations[0].subjectid, waiting_task ) + validation_set.validations[0].prediction_feature, validation_set.validations[0].subjectid, waiting_task ) end end diff --git a/report/report_service.rb b/report/report_service.rb index f315b04..f11a7a8 100644 --- a/report/report_service.rb +++ b/report/report_service.rb @@ -25,7 +25,7 @@ module Reports def initialize(home_uri) raise "supposed to be a singleton" if defined?@@instance raise "plz specify home_uri" unless home_uri - LOGGER.info "init report service" + #LOGGER.info "init report service" @home_uri = home_uri @@instance = self end diff --git a/report/validation_access.rb b/report/validation_access.rb index 4d9ed9f..463337b 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -158,6 +158,9 @@ class Reports::ValidationDB if features_found LOGGER.debug "all training-features found in test-datset" uri = test_dataset.uri + elsif validation.model_uri=~/superservice/ + uri = OpenTox::RestClientWrapper.post(validation.model_uri+"/test_dataset_features", + {:dataset_uri=>validation.test_dataset_uri}).to_s else m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) feat_gen = nil diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 9233502..c873a72 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -9,8 +9,8 @@ require 'validation/validation_service.rb' helpers do def check_stratified(params) params[:stratified] = "false" unless params[:stratified] - raise OpenTox::BadRequestError.new "stratified != true|false|super|super4|super5|anti, is #{params[:stratified]}" unless - params[:stratified]=~/^(true|false|super|super4|super5|anti)$/ + raise OpenTox::BadRequestError.new "stratified != true|false|super|super4|super5|contra, is #{params[:stratified]}" unless + params[:stratified]=~/^(true|false|super|super4|super5|contra)$/ end end @@ -745,6 +745,47 @@ end # return validation.send(params[:attribute]) #end + +def get_splits(id) + require "#{ENV['HOME']}/workspace/ValidationExperiments/dataset_split.rb" + validation = Validation::Validation[id] + raise OpenTox::NotFoundError.new "Validation '#{id}' not found." unless validation + Exp::DatasetSplit.find({:train_dataset_uri => validation.training_dataset_uri,:test_dataset_uri => validation.test_dataset_uri,}) +end + +def get_split(id,id2) + get_splits(id).each do |s| + return s if id2.to_s==s.id.to_s + end + raise "not found: dataset split with id #{id2}" unless split +end + +get '/:id/split' do + splits = get_splits(params[:id]) + base_uri = "http://local-ot/validation/#{params[:id]}/split/" + uris = splits.collect{|s| base_uri+s.id} + if request.env['HTTP_ACCEPT'] =~ /text\/html/ + content_type "text/html" + OpenTox.text_to_html uris.join("\n") + else + content_type "text/uri-list" + uris.join("\n") + end +end + +get '/:id/split/:id2' do + split = get_split(params[:id],params[:id2]) + content_type "text/html" + split.inspect + OpenTox.text_to_html ["http://local-ot/validation/#{params[:id]}/split/#{params[:id2]}/viz",split].to_yaml +end + +get '/:id/split/:id2/viz' do + split = get_split(params[:id],params[:id2]) + content_type("image/svg+xml") + result = body(File.new(split.svg_path)) +end + get '/:id/migrate_median_confidence' do LOGGER.debug "migrate median confidence" validation = Validation::Validation[params[:id]] diff --git a/validation/validation_service.rb b/validation/validation_service.rb index e099e25..f57b3a5 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -653,15 +653,21 @@ module Validation meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) } case stratified - when /^(true|super|super4|super5|anti)$/ + when /^(true|super|super4|super5|contra)$/ raise "store split clusters not available for true stratified splits" if store_split_clusters and stratified=="true" if stratified=="true" raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature LOGGER.warn "split features are ignored for stratified splits (use super instead)" if features features = [prediction_feature] + elsif stratified=="contra" + raise OpenTox::BadRequestError.new "prediction feature required for anti-stratified splits" unless prediction_feature + LOGGER.debug "prediction feature is removed for anti-stratified splits" + features = orig_dataset.features.keys-[prediction_feature] + raise unless features.size==orig_dataset.features.size-1 else LOGGER.warn "prediction feature is ignored for super- or anti-stratified splits" if prediction_feature end + LOGGER.debug "Using "+features.size.to_s+"/"+orig_dataset.features.size.to_s+" features for splitting" if features r_util = OpenTox::RUtil.new train, test = r_util.stratified_split( orig_dataset, meta, missing_values, split_ratio, @subjectid, random_seed, features, stratified, store_split_clusters ) @@ -705,7 +711,7 @@ module Validation end LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" else - raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}" + raise OpenTox::BadRequestError.new "stratified != false|true|super|contra, is #{stratified}" end result end diff --git a/validation/validation_test.rb b/validation/validation_test.rb index 1a41881..a7fae88 100755 --- a/validation/validation_test.rb +++ b/validation/validation_test.rb @@ -60,6 +60,10 @@ class ValidationTest < Test::Unit::TestCase begin $test_case = self + get '2568/split' + puts last_response.body + exit + # dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=0" # test_dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=1" # #prediction_feature = "http://apps.ideaconsult.net:8080/ambit2/feature/528321" @@ -85,16 +89,16 @@ class ValidationTest < Test::Unit::TestCase # {:validation_uris=>"http://local-ot/validation/389,http://local-ot/validation/390,http://local-ot/validation/391,http://local-ot/validation/392", # :identifier=>"split1,split1,split2,split2"} - post "/training_test_validation",{:prediction_feature=>"http://local-ot/dataset/9264/feature/endpoint", - :training_dataset_uri=>"http://local-ot/dataset/119127", - :algorithm_uri=>"http://local-ot/weka/RandomForest", - :test_dataset_uri=>"http://local-ot/dataset/119128", - :test_target_dataset_uri=>"http://local-ot/dataset/9264", - :algorithm_params=>""} - uri = last_response.body - rep = wait_for_task(uri) - puts rep - exit +# post "/training_test_validation",{:prediction_feature=>"http://local-ot/dataset/9264/feature/endpoint", +# :training_dataset_uri=>"http://local-ot/dataset/119127", +# :algorithm_uri=>"http://local-ot/weka/RandomForest", +# :test_dataset_uri=>"http://local-ot/dataset/119128", +# :test_target_dataset_uri=>"http://local-ot/dataset/9264", +# :algorithm_params=>""} +# uri = last_response.body +# rep = wait_for_task(uri) +# puts rep +# exit # # #post "/report/validation",{:validation_uris=>"http://local-ot/validation/22849",:min_confidence=>0.5} # get "/22849",{:min_confidence=>0.5} @@ -108,10 +112,12 @@ class ValidationTest < Test::Unit::TestCase # #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603306?feature_uris[]=http://apps.ideaconsult.net:8080/ambit2/feature/764036" # #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603204" - #dataset_uri = "http://local-ot/dataset/9264" - #post "/plain_training_test_split",{:dataset_uri=>dataset_uri, :stratified=>"super", :split_ratio=>0.25} + dataset_uri = "http://local-ot/dataset/12084" + prediction_feature = "http://local-ot/dataset/12084/feature/bbrc/1" + post "/plain_training_test_split",{:dataset_uri=>dataset_uri, :stratified=>"contra", :split_ratio=>0.25, :missing_values=>"0", + :prediction_feature => prediction_feature} - get '31355/median_confidence' + #get '31355/median_confidence' puts last_response.body exit -- cgit v1.2.3