From 9f2be4ca3bded1543142f5e3654693ce65aadb44 Mon Sep 17 00:00:00 2001
From: mguetlein
Date: Wed, 18 Jan 2012 17:54:11 +0100
Subject: add super-stratification split for training test splitting

---
 lib/r-util.rb                        |  82 +++++++++++++++++++++++
 lib/stratification.R                 | 123 +++++++++++++++++++++++++++++++++++
 validation/validation_application.rb |   8 ++-
 validation/validation_service.rb     |  65 +++++++++---------
 4 files changed, 240 insertions(+), 38 deletions(-)
 create mode 100644 lib/r-util.rb
 create mode 100644 lib/stratification.R

diff --git a/lib/r-util.rb b/lib/r-util.rb
new file mode 100644
index 0000000..0d58389
--- /dev/null
+++ b/lib/r-util.rb
@@ -0,0 +1,82 @@
+# pending: package dir hack ---------
+# CONFIG[:base_dir] = "/home//opentox-ruby/www"
+# PACKAGE_DIR = "/home//opentox-ruby/r-packages"
+package_dir = CONFIG[:base_dir].split("/")
+package_dir[-1] = "r-packages"
+package_dir = package_dir.join("/")
+PACKAGE_DIR = package_dir
+
+
+module Lib
+
+  module RUtil
+
+    def self.dataset_to_dataframe( dataset )
+      LOGGER.debug "convert dataset to dataframe #{dataset.uri}"
+      all_features = []
+      dataset.features.each do |f|
+        feat_name = "feature_#{f[0].split("/")[-1]}"
+        LOGGER.debug "- adding feature: #{feat_name}"
+        feat = OpenTox::Feature.find(f[0])
+        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        values = []
+        dataset.compounds.each do |c|
+          val = dataset.data_entries[c][f[0]]
+          raise "not yet implemented" if val!=nil && val.size>1
+          v = val==nil ? "" : val[0].to_s
+          v = "NA" if v.size()==0
+          values << v
+        end
+        all_features << feat_name
+        @@r.assign feat_name,values
+        @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal
+      end
+      df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
+      cmd = "#{df_name} <- data.frame(#{all_features.join(",")})"
+      @@r.eval cmd
+      #@@r.eval "head(#{df_name})"
+      df_name
+    end
+
+    def self.stratified_split( dataframe, pct=0.3, seed=42 )
+      @@r.eval "set.seed(#{seed})"
+      @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})"
+      split = @@r.pull 'split'
+      split.collect{|s| s.to_i}
+    end
+
+    def self.package_installed?( package )
+      @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")"
+      p = @@r.pull "installed.packages()[,1]"
+      p.include?(package)
+    end
+
+    def self.install_packages( package )
+      unless package_installed?( package )
+        @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")"
+      end
+    end
+
+    def self.library( package )
+      install_packages( package )
+      @@r.eval "library(\"#{package}\")"
+    end
+
+    def self.init_r
+      @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r
+      library("sampling")
+      library("gam")
+      @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")"
+    end
+
+    def self.quit_r
+      begin
+        @@r.quit
+        @@r = nil
+      rescue
+      end
+    end
+
+  end
+end
diff --git a/lib/stratification.R b/lib/stratification.R
new file mode 100644
index 0000000..9aa8d1f
--- /dev/null
+++ b/lib/stratification.R
@@ -0,0 +1,123 @@
+library("sampling")
+library("gam")
+
+nominal_to_binary <- function( orig_data )
+{
+  data = as.data.frame( orig_data )
+  result = NULL
+  for (i in 1:ncol(data))
+  {
+    #print(i)
+    if (is.numeric( data[,i] ) )
+    {
+      if (is.null(result))
+        result = data.frame(data[,i])
+      else
+        result = data.frame(result, data[,i])
+      colnames(result)[ncol(result)] <- colnames(data)[i]
+    }
+    else
+    {
+      vals = unique(data[,i])
+      for (j in 1:length(vals))
+      {
+        #print(j)
+        bins = c()
+        for (k in 1:nrow(data))
+        {
+          if(data[,i][k] == vals[j])
+            bins = c(bins,1)
+          else
+            bins = c(bins,0)
+        }
+        #print(bins)
+        if (is.null(result))
+          result = data.frame(bins)
+        else
+          result = data.frame(result, bins)
+        colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+        if (length(vals)==2) break
+      }
+    }
+  }
+  result
+}
+
+process_data <- function( data )
+{
+  if (!is.numeric(data))
+    data.num = nominal_to_binary(data)
+  else
+    data.num = data
+  if(any(is.na(data.num)))
+    data.repl = na.gam.replace(data.num)
+  else
+    data.repl = data.num
+  data.repl
+}
+
+stratified_split <- function( data, ratio=0.3 )
+{
+  data.processed = as.matrix(process_data( data ))
+  pik = rep(ratio,times=nrow(data.processed))
+  data.strat = cbind(pik,data.processed)
+  samplecube(data.strat,pik,order=2,comment=F)
+}
+
+stratified_k_fold_split <- function( data, num_folds=10 )
+{
+  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+  data.processed = as.matrix(process_data( data ))
+  folds = rep(0, times=nrow(data))
+  for (i in 1:(num_folds-1))
+  {
+    prop = 1/(num_folds-(i-1))
+    print(paste("fold",i,"/",num_folds," prop",prop))
+    pik = rep(prop,times=nrow(data))
+    for (j in 1:nrow(data))
+      if(folds[j]!=0)
+        pik[j]=0
+    data.strat = cbind(pik,data.processed)
+    s<-samplecube(data.strat,pik,order=2,comment=F)
+    print(paste("fold size: ",sum(s)))
+    for (j in 1:nrow(data))
+      if (s[j] == 1)
+        folds[j]=i
+  }
+  for (j in 1:nrow(data))
+    if (folds[j] == 0)
+      folds[j]=num_folds
+  folds
+}
+
+plot_split <- function( data, split )
+{
+  data.processed = process_data( data )
+  data.pca <- prcomp(data.processed, scale=TRUE)
+  data.2d = as.data.frame(data.pca$x)[1:2]
+  plot( NULL,
+    xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]),
+    xlab = "pc 1", ylab = "pc 2")
+  for (j in 0:max(split))
+  {
+    set = c()
+    for (i in 1:nrow(data))
+      if (split[i] == j)
+        set = c(set,i)
+    points(data.2d[set,], pch = 2, col=(j+1))
+  }
+}
+
+#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.3)
+#plot_split(data,split)
+
+
+
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index c02b5f3..cda09fa 100755
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -453,8 +453,9 @@ post '/training_test_split' do
   raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0
   raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0
   task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params
+    strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified]
     params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature],
-      @subjectid, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33)))
+      @subjectid, strat, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33)))
     v = Validation::Validation.create :validation_type => "training_test_split",
       :training_dataset_uri => params[:training_dataset_uri],
       :test_dataset_uri => params[:test_dataset_uri],
@@ -544,8 +545,9 @@ end
 post '/plain_training_test_split' do
   LOGGER.info "creating pure training test split "+params.inspect
   raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri]
-
-  result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed])
+  strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified]
+  result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid,
+    strat, params[:split_ratio], params[:random_seed])
   content_type "text/uri-list"
   result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n"
 end
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 889c652..dceead9 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -2,6 +2,7 @@
 
 require "lib/validation_db.rb"
 require "lib/ot_predictions.rb"
+require "lib/r-util.rb"
 
 require "validation/validation_format.rb"
 
@@ -618,17 +619,17 @@ module Validation
 
     # splits a dataset into test and training dataset
     # returns map with training_dataset_uri and test_dataset_uri
-    def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, split_ratio=nil, random_seed=nil, task=nil )
+    def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil )
       split_ratio=0.67 unless split_ratio
       split_ratio = split_ratio.to_f
      random_seed=1 unless random_seed
      random_seed = random_seed.to_i
+      raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
+      raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
       orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid
       orig_dataset.load_all subjectid
       raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
-      raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
-      raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
       if prediction_feature
         raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
           "' not found in dataset, features are: \n"+
@@ -637,55 +638,49 @@ module Validation
         LOGGER.warn "no prediciton feature given, all features included in test dataset"
       end
-      compounds = orig_dataset.compounds
-      raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
-      split = (compounds.size*split_ratio).to_i
-      split = [split,1].max
-      split = [split,compounds.size-2].min
-
-      LOGGER.debug "splitting dataset "+orig_dataset_uri+
+      if stratified
+        Lib::RUtil.init_r
+        df = Lib::RUtil.dataset_to_dataframe( orig_dataset )
+        split = Lib::RUtil.stratified_split( df, split_ratio, random_seed )
+        Lib::RUtil.quit_r
+        raise "internal error" unless split.size==orig_dataset.compounds.size
+        task.progress(33) if task
+
+        training_compounds = []
+        split.size.times{|i| training_compounds << orig_dataset.compounds[i] if split[i]==1}
+        test_compounds = orig_dataset.compounds - training_compounds
+      else
+        compounds = orig_dataset.compounds
+        raise OpenTox::BadRequestError.new "Cannot split dataset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
+        split = (compounds.size*split_ratio).to_i
+        split = [split,1].max
+        split = [split,compounds.size-2].min
+        LOGGER.debug "splitting dataset "+orig_dataset_uri+
         " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+
         " (shuffled with seed "+random_seed.to_s+")"
-      compounds.shuffle!( random_seed )
+        compounds.shuffle!( random_seed )
+        training_compounds = compounds[0..split]
+        test_compounds = compounds[(split+1)..-1]
+      end
       task.progress(33) if task
-
-      result = {}
-#      result[:training_dataset_uri] = orig_dataset.create_new_dataset( compounds[0..split],
-#        orig_dataset.features,
-#        "Training dataset split of "+orig_dataset.title.to_s,
-#        $sinatra.url_for('/training_test_split',:full) )
-#      orig_dataset.data_entries.each do |k,v|
-#        puts k.inspect+" =>"+v.inspect
-#        puts v.values[0].to_s+" "+v.values[0].class.to_s
-#      end
+      result = {}
-      result[:training_dataset_uri] = orig_dataset.split( compounds[0..split],
+      result[:training_dataset_uri] = orig_dataset.split( training_compounds,
         orig_dataset.features.keys,
         { DC.title => "Training dataset split of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/training_test_split',:full) },
         subjectid ).uri
       task.progress(66) if task
-#      d = Lib::DatasetCache.find(result[:training_dataset_uri])
-#      d.data_entries.values.each do |v|
-#        puts v.inspect
-#        puts v.values[0].to_s+" "+v.values[0].class.to_s
-#      end
-#      raise "stop here"
-
-#      result[:test_dataset_uri] = orig_dataset.create_new_dataset( compounds[(split+1)..-1],
-#        orig_dataset.features.dclone - [prediction_feature],
-#        "Test dataset split of "+orig_dataset.title.to_s,
-#        $sinatra.url_for('/training_test_split',:full) )
-      result[:test_dataset_uri] = orig_dataset.split( compounds[(split+1)..-1],
+      result[:test_dataset_uri] = orig_dataset.split( test_compounds,
         orig_dataset.features.keys.dclone - [prediction_feature],
         { DC.title => "Test dataset split of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/training_test_split',:full) },
         subjectid ).uri
       task.progress(100) if task
-      if ENV['RACK_ENV'] =~ /test|debug/
+      if !stratified and ENV['RACK_ENV'] =~ /test|debug/
         raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
         test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
-- 
cgit v1.2.3
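
For reference, a minimal Ruby sketch of how the helpers added in lib/r-util.rb are driven, mirroring the stratified branch of train_test_dataset_split above. The dataset URI and the nil subjectid are placeholders, not values from this patch; 0.67 and 1 are the split_ratio and random_seed defaults used by the service.

    # Sketch only: exercises the new Lib::RUtil helpers outside the validation service.
    require "lib/r-util.rb"

    dataset = Lib::DatasetCache.find("http://example.org/dataset/1", nil)  # placeholder URI and subjectid
    dataset.load_all(nil)

    Lib::RUtil.init_r                                  # start RinRuby, install/load "sampling" and "gam", source stratification.R
    df = Lib::RUtil.dataset_to_dataframe(dataset)      # copy compounds x features into an R data.frame
    split = Lib::RUtil.stratified_split(df, 0.67, 1)   # one 0/1 flag per compound; 1 marks ~67% selected for training
    Lib::RUtil.quit_r

    training_compounds = []
    split.size.times { |i| training_compounds << dataset.compounds[i] if split[i] == 1 }
    test_compounds = dataset.compounds - training_compounds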