diff options
Diffstat (limited to 'lib/stratification.R')
-rw-r--r-- | lib/stratification.R | 123 |
1 files changed, 0 insertions, 123 deletions
diff --git a/lib/stratification.R b/lib/stratification.R deleted file mode 100644 index 9aa8d1f..0000000 --- a/lib/stratification.R +++ /dev/null @@ -1,123 +0,0 @@ -library("sampling") -library("gam") - -nominal_to_binary <- function( orig_data ) -{ - data = as.data.frame( orig_data ) - result = NULL - for (i in 1:ncol(data)) - { - #print(i) - if (is.numeric( data[,i] ) ) - { - if (is.null(result)) - result = data.frame(data[,i]) - else - result = data.frame(result, data[,i]) - colnames(result)[ncol(result)] <- colnames(data)[i] - } - else - { - vals = unique(data[,i]) - for (j in 1:length(vals)) - { - #print(j) - bins = c() - for (k in 1:nrow(data)) - { - if(data[,i][k] == vals[j]) - bins = c(bins,1) - else - bins = c(bins,0) - } - #print(bins) - if (is.null(result)) - result = data.frame(bins) - else - result = data.frame(result, bins) - colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) - if (length(vals)==2) break - } - } - } - result -} - -process_data <- function( data ) -{ - if (!is.numeric(data)) - data.num = nominal_to_binary(data) - else - data.num = data - if(any(is.na(data.num))) - data.repl = na.gam.replace(data.num) - else - data.repl = data.num - data.repl -} - -stratified_split <- function( data, ratio=0.3 ) -{ - data.processed = as.matrix(process_data( data )) - pik = rep(ratio,times=nrow(data.processed)) - data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) -} - -stratified_k_fold_split <- function( data, num_folds=10 ) -{ - print(paste(num_folds,"-fold-split, data-size",nrow(data))) - data.processed = as.matrix(process_data( data )) - folds = rep(0, times=nrow(data)) - for (i in 1:(num_folds-1)) - { - prop = 1/(num_folds-(i-1)) - print(paste("fold",i,"/",num_folds," prop",prop)) - pik = rep(prop,times=nrow(data)) - for (j in 1:nrow(data)) - if(folds[j]!=0) - pik[j]=0 - data.strat = cbind(pik,data.processed) - s<-samplecube(data.strat,pik,order=2,comment=F) - print(paste("fold size: ",sum(s))) - for (j in 1:nrow(data)) - if (s[j] == 1) - folds[j]=i - } - for (j in 1:nrow(data)) - if (folds[j] == 0) - folds[j]=num_folds - folds -} - -plot_split <- function( data, split ) -{ - data.processed = process_data( data ) - data.pca <- prcomp(data.processed, scale=TRUE) - data.2d =as.data.frame(data.pca$x)[1:2] - plot( NULL, - xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), - xlab = "pc 1", ylab = "pc 2") - for (j in 0:max(split)) - { - set = c() - for (i in 1:nrow(data)) - if (split[i] == j) - set = c(set,i) - points(data.2d[set,], pch = 2, col=(j+1)) - } -} - -#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) -#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) -#data<-rbind(a,b) -#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) -#data<-rbind(data,c) -#data=iris -#split = stratified_k_fold_split(data, num_folds=3) -#split = stratified_split(data, ratio=0.3) -#plot_split(data,split) - - - - |