summaryrefslogtreecommitdiff
path: root/lib/stratification.R
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stratification.R')
-rw-r--r--lib/stratification.R123
1 files changed, 0 insertions, 123 deletions
diff --git a/lib/stratification.R b/lib/stratification.R
deleted file mode 100644
index 9aa8d1f..0000000
--- a/lib/stratification.R
+++ /dev/null
@@ -1,123 +0,0 @@
-library("sampling")
-library("gam")
-
-nominal_to_binary <- function( orig_data )
-{
- data = as.data.frame( orig_data )
- result = NULL
- for (i in 1:ncol(data))
- {
- #print(i)
- if (is.numeric( data[,i] ) )
- {
- if (is.null(result))
- result = data.frame(data[,i])
- else
- result = data.frame(result, data[,i])
- colnames(result)[ncol(result)] <- colnames(data)[i]
- }
- else
- {
- vals = unique(data[,i])
- for (j in 1:length(vals))
- {
- #print(j)
- bins = c()
- for (k in 1:nrow(data))
- {
- if(data[,i][k] == vals[j])
- bins = c(bins,1)
- else
- bins = c(bins,0)
- }
- #print(bins)
- if (is.null(result))
- result = data.frame(bins)
- else
- result = data.frame(result, bins)
- colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
- if (length(vals)==2) break
- }
- }
- }
- result
-}
-
-process_data <- function( data )
-{
- if (!is.numeric(data))
- data.num = nominal_to_binary(data)
- else
- data.num = data
- if(any(is.na(data.num)))
- data.repl = na.gam.replace(data.num)
- else
- data.repl = data.num
- data.repl
-}
-
-stratified_split <- function( data, ratio=0.3 )
-{
- data.processed = as.matrix(process_data( data ))
- pik = rep(ratio,times=nrow(data.processed))
- data.strat = cbind(pik,data.processed)
- samplecube(data.strat,pik,order=2,comment=F)
-}
-
-stratified_k_fold_split <- function( data, num_folds=10 )
-{
- print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data ))
- folds = rep(0, times=nrow(data))
- for (i in 1:(num_folds-1))
- {
- prop = 1/(num_folds-(i-1))
- print(paste("fold",i,"/",num_folds," prop",prop))
- pik = rep(prop,times=nrow(data))
- for (j in 1:nrow(data))
- if(folds[j]!=0)
- pik[j]=0
- data.strat = cbind(pik,data.processed)
- s<-samplecube(data.strat,pik,order=2,comment=F)
- print(paste("fold size: ",sum(s)))
- for (j in 1:nrow(data))
- if (s[j] == 1)
- folds[j]=i
- }
- for (j in 1:nrow(data))
- if (folds[j] == 0)
- folds[j]=num_folds
- folds
-}
-
-plot_split <- function( data, split )
-{
- data.processed = process_data( data )
- data.pca <- prcomp(data.processed, scale=TRUE)
- data.2d =as.data.frame(data.pca$x)[1:2]
- plot( NULL,
- xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]),
- xlab = "pc 1", ylab = "pc 2")
- for (j in 0:max(split))
- {
- set = c()
- for (i in 1:nrow(data))
- if (split[i] == j)
- set = c(set,i)
- points(data.2d[set,], pch = 2, col=(j+1))
- }
-}
-
-#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
-#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
-#data<-rbind(a,b)
-#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
-#data<-rbind(data,c)
-#data=iris
-#split = stratified_k_fold_split(data, num_folds=3)
-#split = stratified_split(data, ratio=0.3)
-#plot_split(data,split)
-
-
-
-