summaryrefslogtreecommitdiff
path: root/lib/stratification.R
blob: 9aa8d1f90ba4802277a14f94e90ed9c2048a6eba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
library("sampling")
library("gam")

nominal_to_binary <- function( orig_data )
{
  data = as.data.frame( orig_data )
  result = NULL
  for (i in 1:ncol(data))
  {
     #print(i)
     if (is.numeric( data[,i] ) )
     {
        if (is.null(result))
          result = data.frame(data[,i])
        else
          result = data.frame(result, data[,i])
        colnames(result)[ncol(result)] <- colnames(data)[i]
     }
     else
     {
        vals = unique(data[,i])
        for (j in 1:length(vals))
        {
           #print(j)
           bins = c()
           for (k in 1:nrow(data))
           {
              if(data[,i][k] == vals[j])
                bins = c(bins,1)
              else
                bins = c(bins,0)
           }
           #print(bins)
           if (is.null(result))
             result = data.frame(bins)
           else
             result = data.frame(result, bins)
           colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
           if (length(vals)==2) break
        }
     }
  }
  result
}

process_data <- function( data )
{
  if (!is.numeric(data))
    data.num = nominal_to_binary(data)
  else
    data.num = data
  if(any(is.na(data.num)))
  	data.repl = na.gam.replace(data.num)
  else
  	data.repl = data.num
  data.repl
}

stratified_split <- function( data, ratio=0.3 )
{
    data.processed = as.matrix(process_data( data ))
    pik = rep(ratio,times=nrow(data.processed))
    data.strat = cbind(pik,data.processed)
    samplecube(data.strat,pik,order=2,comment=F)
}

stratified_k_fold_split <- function( data, num_folds=10 )
{
  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
  data.processed = as.matrix(process_data( data ))
  folds = rep(0, times=nrow(data))
  for (i in 1:(num_folds-1))
  {
    prop = 1/(num_folds-(i-1))
    print(paste("fold",i,"/",num_folds," prop",prop))
    pik = rep(prop,times=nrow(data))
    for (j in 1:nrow(data))
      if(folds[j]!=0)
        pik[j]=0
    data.strat = cbind(pik,data.processed)
    s<-samplecube(data.strat,pik,order=2,comment=F)
    print(paste("fold size: ",sum(s)))
    for (j in 1:nrow(data))
      if (s[j] == 1)
        folds[j]=i
  }
  for (j in 1:nrow(data))
    if (folds[j] == 0)
      folds[j]=num_folds
  folds
}

plot_split <- function( data, split )
{
  data.processed = process_data( data )
	data.pca <- prcomp(data.processed, scale=TRUE)
  data.2d =as.data.frame(data.pca$x)[1:2]
  plot( NULL,
        xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), 
        xlab = "pc 1", ylab = "pc 2")
  for (j in 0:max(split))
  {
    set = c()
    for (i in 1:nrow(data))
      if (split[i] == j)
        set = c(set,i)
    points(data.2d[set,], pch = 2, col=(j+1))
  }
}

#a<-matrix(rnorm(100, mean=50,  sd=4), ncol=5)
#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
#data<-rbind(a,b)
#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
#data<-rbind(data,c)
#data=iris
#split = stratified_k_fold_split(data, num_folds=3)
#split = stratified_split(data, ratio=0.3)
#plot_split(data,split)