From 6061f3abcd6d0ecd28eb1ba2ec5bff488fbd0961 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Fri, 15 Jun 2012 14:03:59 +0200 Subject: dataset add() speedup, handle missing values in dataframe to dataset, modfied debug msges --- example.rb | 2 +- validation/validation_application.rb | 13 +++++++------ validation/validation_service.rb | 14 +++++++++----- validation/validation_test.rb | 9 ++++++++- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/example.rb b/example.rb index 636579e..d6fba95 100755 --- a/example.rb +++ b/example.rb @@ -1,5 +1,5 @@ -require 'lib/test_util.rb' +#require 'lib/test_util.rb' class Example diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 6b2ef3a..258a681 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -488,7 +488,8 @@ post '/training_test_split' do check_stratified(params) task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], - @subjectid, params[:stratified], params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33))) + @subjectid, params[:stratified], params[:split_ratio], params[:random_seed], params[:missing_values], + OpenTox::SubTask.create(task,0,33))) v = Validation::Validation.create :validation_type => "training_test_split", :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], @@ -583,10 +584,10 @@ post '/plain_training_test_split' do check_stratified(params) task = OpenTox::Task.create( "Create data-split", url_for("/plain_training_test_split", :full) ) do |task| result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid, - params[:stratified], params[:split_ratio], params[:random_seed], task) + params[:stratified], params[:split_ratio], params[:random_seed], params[:missing_values], task) content_type "text/uri-list" res = result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" - LOGGER.info "plain training test split done #{res}" + LOGGER.info "plain training test split done #{res.to_s.gsub("\n"," \\n ")}" res end return_task(task) @@ -672,7 +673,7 @@ get '/:id/viz' do data.features.each do |f,m| d.add_feature(f,m) data.data_entries[c][f].each do |v| - d.add(c,f,v) + d.add(c,f,v,true) end if data.data_entries[c][f] end end @@ -685,8 +686,8 @@ get '/:id/viz' do [p,a].each do |v| raise p.class.to_s+" "+p.inspect unless p.is_a?(Array) and p.size==1 end - d.add(c,predicted_nice_feature,p[0]) - d.add(c,correct_classified_feature,p[0]==a[0] ? "correct" : "miss") + d.add(c,predicted_nice_feature,p[0],true) + d.add(c,correct_classified_feature,p[0]==a[0] ? "correct" : "miss",true) end end d.to_csv diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 3966d7e..5398ace 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -491,6 +491,7 @@ module Validation test_dataset_uris << test_dataset_uri end when /true|super/ + raise "DO NOT USED SUPER-STRATIFICATION FOR VAL-EXPERIMENTS AND CV, IF SO SOLVE _MISSING_VAULE_NA_ PROBLEM" if stratified=="super" if stratified=="true" features = [ self.prediction_feature ] else @@ -540,7 +541,7 @@ module Validation "' not found in dataset, features are: \n"+ orig_dataset.features.inspect unless orig_dataset.features.include?(prediction_feature) else - LOGGER.warn "no prediciton feature given, all features included in test dataset" + LOGGER.debug "no prediciton feature given, all features included in test dataset" end compounds = orig_dataset.compounds @@ -615,12 +616,15 @@ module Validation # splits a dataset into test and training dataset # returns map with training_dataset_uri and test_dataset_uri - def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", split_ratio=nil, random_seed=nil, task=nil ) + def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", + split_ratio=nil, random_seed=nil, missing_values=nil, task=nil ) + split_ratio=0.67 unless split_ratio split_ratio = split_ratio.to_f random_seed=1 unless random_seed random_seed = random_seed.to_i - + missing_values = "NA" unless missing_values + raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1 orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid @@ -631,7 +635,7 @@ module Validation "' not found in dataset, features are: \n"+ orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature) else - LOGGER.warn "no prediciton feature given, all features will be included in test dataset" + LOGGER.debug "no prediciton feature given, all features will be included in test dataset" end meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) } @@ -646,7 +650,7 @@ module Validation features = nil end r_util = OpenTox::RUtil.new - train, test = r_util.stratified_split( orig_dataset, meta, "NA", split_ratio, @subjectid, random_seed, features, stratified=="anti" ) + train, test = r_util.stratified_split( orig_dataset, meta, missing_values, split_ratio, @subjectid, random_seed, features, stratified=="anti" ) r_util.quit_r result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri} when "false" diff --git a/validation/validation_test.rb b/validation/validation_test.rb index ce6d5c1..78f35d2 100755 --- a/validation/validation_test.rb +++ b/validation/validation_test.rb @@ -85,7 +85,14 @@ class ValidationTest < Test::Unit::TestCase # {:validation_uris=>"http://local-ot/validation/389,http://local-ot/validation/390,http://local-ot/validation/391,http://local-ot/validation/392", # :identifier=>"split1,split1,split2,split2"} - #post "/report/validation",{:validation_uris=>"http://local-ot/validation/22849",:min_confidence=>0.5} + post "/training_test_validation",{:prediction_feature=>"http://opentox.informatik.uni-freiburg.de/dataset/2/feature/MTP", + :training_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/4755", + :algorithm_uri=>"http://opentox.informatik.uni-freiburg.de/superservice", + :test_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/3556", + :algorithm_params=>"prediction_algorithm=http://opentox.informatik.uni-freiburg.de/weka/M5P;create_bbrc_features=false;ad_algorithm=http://opentox.informatik.uni-freiburg.de/appdomain/EuclideanDistance"} + exit + + #post "/report/validation",{:validation_uris=>"http://local-ot/validation/22849",:min_confidence=>0.5} get "/22849",{:min_confidence=>0.5} exit -- cgit v1.2.3