diff options
Diffstat (limited to 'lib/stratification.R')
-rw-r--r-- | lib/stratification.R | 123 |
1 files changed, 123 insertions, 0 deletions
diff --git a/lib/stratification.R b/lib/stratification.R new file mode 100644 index 0000000..9aa8d1f --- /dev/null +++ b/lib/stratification.R @@ -0,0 +1,123 @@ +library("sampling") +library("gam") + +nominal_to_binary <- function( orig_data ) +{ + data = as.data.frame( orig_data ) + result = NULL + for (i in 1:ncol(data)) + { + #print(i) + if (is.numeric( data[,i] ) ) + { + if (is.null(result)) + result = data.frame(data[,i]) + else + result = data.frame(result, data[,i]) + colnames(result)[ncol(result)] <- colnames(data)[i] + } + else + { + vals = unique(data[,i]) + for (j in 1:length(vals)) + { + #print(j) + bins = c() + for (k in 1:nrow(data)) + { + if(data[,i][k] == vals[j]) + bins = c(bins,1) + else + bins = c(bins,0) + } + #print(bins) + if (is.null(result)) + result = data.frame(bins) + else + result = data.frame(result, bins) + colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) + if (length(vals)==2) break + } + } + } + result +} + +process_data <- function( data ) +{ + if (!is.numeric(data)) + data.num = nominal_to_binary(data) + else + data.num = data + if(any(is.na(data.num))) + data.repl = na.gam.replace(data.num) + else + data.repl = data.num + data.repl +} + +stratified_split <- function( data, ratio=0.3 ) +{ + data.processed = as.matrix(process_data( data )) + pik = rep(ratio,times=nrow(data.processed)) + data.strat = cbind(pik,data.processed) + samplecube(data.strat,pik,order=2,comment=F) +} + +stratified_k_fold_split <- function( data, num_folds=10 ) +{ + print(paste(num_folds,"-fold-split, data-size",nrow(data))) + data.processed = as.matrix(process_data( data )) + folds = rep(0, times=nrow(data)) + for (i in 1:(num_folds-1)) + { + prop = 1/(num_folds-(i-1)) + print(paste("fold",i,"/",num_folds," prop",prop)) + pik = rep(prop,times=nrow(data)) + for (j in 1:nrow(data)) + if(folds[j]!=0) + pik[j]=0 + data.strat = cbind(pik,data.processed) + s<-samplecube(data.strat,pik,order=2,comment=F) + print(paste("fold size: ",sum(s))) + for (j in 1:nrow(data)) + if (s[j] == 1) + folds[j]=i + } + for (j in 1:nrow(data)) + if (folds[j] == 0) + folds[j]=num_folds + folds +} + +plot_split <- function( data, split ) +{ + data.processed = process_data( data ) + data.pca <- prcomp(data.processed, scale=TRUE) + data.2d =as.data.frame(data.pca$x)[1:2] + plot( NULL, + xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), + xlab = "pc 1", ylab = "pc 2") + for (j in 0:max(split)) + { + set = c() + for (i in 1:nrow(data)) + if (split[i] == j) + set = c(set,i) + points(data.2d[set,], pch = 2, col=(j+1)) + } +} + +#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) +#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) +#data<-rbind(a,b) +#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) +#data<-rbind(data,c) +#data=iris +#split = stratified_k_fold_split(data, num_folds=3) +#split = stratified_split(data, ratio=0.3) +#plot_split(data,split) + + + + |