summaryrefslogtreecommitdiff
path: root/lib/stratification.R
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stratification.R')
-rw-r--r--lib/stratification.R123
1 files changed, 123 insertions, 0 deletions
diff --git a/lib/stratification.R b/lib/stratification.R
new file mode 100644
index 0000000..9aa8d1f
--- /dev/null
+++ b/lib/stratification.R
@@ -0,0 +1,123 @@
+library("sampling")
+library("gam")
+
+nominal_to_binary <- function( orig_data )
+{
+ data = as.data.frame( orig_data )
+ result = NULL
+ for (i in 1:ncol(data))
+ {
+ #print(i)
+ if (is.numeric( data[,i] ) )
+ {
+ if (is.null(result))
+ result = data.frame(data[,i])
+ else
+ result = data.frame(result, data[,i])
+ colnames(result)[ncol(result)] <- colnames(data)[i]
+ }
+ else
+ {
+ vals = unique(data[,i])
+ for (j in 1:length(vals))
+ {
+ #print(j)
+ bins = c()
+ for (k in 1:nrow(data))
+ {
+ if(data[,i][k] == vals[j])
+ bins = c(bins,1)
+ else
+ bins = c(bins,0)
+ }
+ #print(bins)
+ if (is.null(result))
+ result = data.frame(bins)
+ else
+ result = data.frame(result, bins)
+ colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+ if (length(vals)==2) break
+ }
+ }
+ }
+ result
+}
+
+process_data <- function( data )
+{
+ if (!is.numeric(data))
+ data.num = nominal_to_binary(data)
+ else
+ data.num = data
+ if(any(is.na(data.num)))
+ data.repl = na.gam.replace(data.num)
+ else
+ data.repl = data.num
+ data.repl
+}
+
+stratified_split <- function( data, ratio=0.3 )
+{
+ data.processed = as.matrix(process_data( data ))
+ pik = rep(ratio,times=nrow(data.processed))
+ data.strat = cbind(pik,data.processed)
+ samplecube(data.strat,pik,order=2,comment=F)
+}
+
+stratified_k_fold_split <- function( data, num_folds=10 )
+{
+ print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+ data.processed = as.matrix(process_data( data ))
+ folds = rep(0, times=nrow(data))
+ for (i in 1:(num_folds-1))
+ {
+ prop = 1/(num_folds-(i-1))
+ print(paste("fold",i,"/",num_folds," prop",prop))
+ pik = rep(prop,times=nrow(data))
+ for (j in 1:nrow(data))
+ if(folds[j]!=0)
+ pik[j]=0
+ data.strat = cbind(pik,data.processed)
+ s<-samplecube(data.strat,pik,order=2,comment=F)
+ print(paste("fold size: ",sum(s)))
+ for (j in 1:nrow(data))
+ if (s[j] == 1)
+ folds[j]=i
+ }
+ for (j in 1:nrow(data))
+ if (folds[j] == 0)
+ folds[j]=num_folds
+ folds
+}
+
+plot_split <- function( data, split )
+{
+ data.processed = process_data( data )
+ data.pca <- prcomp(data.processed, scale=TRUE)
+ data.2d =as.data.frame(data.pca$x)[1:2]
+ plot( NULL,
+ xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]),
+ xlab = "pc 1", ylab = "pc 2")
+ for (j in 0:max(split))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (split[i] == j)
+ set = c(set,i)
+ points(data.2d[set,], pch = 2, col=(j+1))
+ }
+}
+
+#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.3)
+#plot_split(data,split)
+
+
+
+