summaryrefslogtreecommitdiff
path: root/lib/stratification.R
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stratification.R')
-rw-r--r--lib/stratification.R230
1 files changed, 205 insertions, 25 deletions
diff --git a/lib/stratification.R b/lib/stratification.R
index 8a838fa..f12b302 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -113,10 +113,59 @@ cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15
cut
}
-stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NULL, preprocess="none" )
+# input: classes [ 1 2 4 3 1 2 4 3 2 1 2 3 3 2 ]
+# input : ratio 0.1
+# output: split [ 1 0 0 1 0 0 0 0 0 1 0 0 0 0 ]
+explicit_stratified_split <- function( class, ratio=0.3 )
+{
+# print(class)
+ num_instances = length(class)
+# print(num_instances)
+
+ # create a list with indexes of each class
+ class_idx <- list()
+ for (i in 1:max(class))
+ class_idx[[i]] <- vector()
+ for (i in 1:num_instances)
+ class_idx[[class[i]]] <- c(class_idx[[class[i]]],i)
+ #print(class_idx)
+ # shuffle each index list
+ for (i in 1:max(class))
+ class_idx[[i]] <- sample(class_idx[[i]])
+ # print(class_idx)
+
+ selected <- vector()
+ sum_total <- 0
+
+ classes = sample(1:max(class))
+ # iterate of classes in random order
+ for (i in classes)
+ {
+ # count the numbers of selections that should be made
+ sum_total <- sum_total + length(class_idx[[i]])
+ selected_total <- round_it(sum_total*ratio)
+ # print(selected_total)
+ # count the number of new selections
+ selected_new <- selected_total - length(selected)
+ # print(selected_new)
+ # add selections
+ selected <- c(selected, class_idx[[i]][1:selected_new])
+ # print(selected)
+ # print("")
+ }
+
+ # convert selected indexs to split array
+ split <- array(0,num_instances)
+ for(i in selected)
+ split[i] <- 1
+ as.vector(split)
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" )
{
data.processed = as.matrix(process_data( data, colnames ))
print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess))
+ cl = NULL
if (preprocess == "none")
{
@@ -137,11 +186,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
data.strat = cbind(pik,data.processed)
- samplecube(data.strat,pik,order=2,comment=F)
+ split = samplecube(data.strat,pik,order=2,comment=F)
}
else if (method == "cluster_knn")
{
- cl = cluster_knn(data.processed)
+ cl = as.vector(cluster_knn(data.processed))
# require("caret")
# res = createDataPartition(cl,p=ratio)
# split = rep(1, times=nrow(data))
@@ -149,12 +198,26 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
# if ( is.na(match(j,res$Resample1)) )
# split[j]=0
# split
- stratified_split(cl,ratio,"samplecube")
+
+ split = stratified_split(cl,ratio,"samplecube")$split
}
else if (method == "cluster_hierarchical")
{
- cl = cluster_hierarchical(data.processed)
- stratified_split(cl,ratio,"samplecube")
+ cl = as.vector(cluster_hierarchical(data.processed))
+
+ #require("caret")
+ #res = createDataPartition(cl,p=ratio)
+ #split = rep(1, times=nrow(data))
+ #for (j in 1:nrow(data))
+ #if ( is.na(match(j,res$Resample1)) )
+ # split[j]=0
+
+ if (method_2 == "samplecube")
+ split = stratified_split(cl,ratio,"samplecube")$split
+ else if (method_2 == "explicit")
+ split = explicit_stratified_split(cl,ratio)
+ else
+ stop("unknown method")
}
## else if (method == "cluster2")
## {
@@ -199,6 +262,8 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
## }
else
stop("unknown method")
+
+ list(split=split, cluster=cl)
}
anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
@@ -213,7 +278,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
data.processed = as.matrix(process_data( data, colnames ))
print(paste("anti-split using #features: ",ncol(data.processed)))
num_c = floor(1/ratio)
- cl = cluster(data.processed, num_c, num_c)
+ cl = cluster_knn(data.processed, num_c, num_c)
#print(cl)
idx = -1
min = 1000000
@@ -244,7 +309,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
for(j in 1:nrow(data))
split[j] = 1-split[j]
#print(split)
- as.vector(split)
+ list(split=as.vector(split),cluster=cl)
}
stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
@@ -382,7 +447,7 @@ plot_pre_process <- function( data, method="pca" )
stop("unknown method")
}
-plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
+plot_split <- function( data, color_idx, circle_idx=NULL, ... )
{
if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
stop("data not suitable for plotting, plot_pre_process() first")
@@ -390,40 +455,155 @@ plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
if (is.null(names))
names <- c("split 1","split 2")
- colos = as.double(rep(2:(max(split)+2)))
- legend("topleft",names,pch=2,col=colos)
+ #colos = as.double(rep(2:(max(color_idx)+2)))
+ #legend("topleft",names,pch=2,col=colos)
+
+ #legend("topleft",names[1],pch=as.double(c(1)),col="red")
- for (j in max(color_idx):0)
+ ## for (j in max(color_idx):0)
+ ## {
+ ## for (k in max(shape_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && shape_idx[i]==k)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 15+k, col=(j+2))
+ ## }
+ ## }
+ col_offset = 2
+ if (!is.null(circle_idx))
+ col_offset = 1
+ for (j in 0:max(color_idx))
{
- for (k in max(shape_idx):0)
- {
- set = c()
- for (i in 1:nrow(data))
- if (color_idx[i]==j && shape_idx[i]==k)
- set = c(set,i)
- points(data[set,], pch = 15+k, col=(j+2))
- }
+ set = c()
+ for (i in 1:nrow(data))
+ if (color_idx[i]==j)
+ set = c(set,i)
+ points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+ }
+ if (!is.null(circle_idx))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (circle_idx[i]==1)
+ set = c(set,i)#
+ points(data[set,], pch = 1, cex=1, col=1)
+ points(data[set,], pch = 1, cex=1.8, col=1)
+
+ ## for (j in max(color_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && circle_idx[i]==1)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 19, cex=1, col=1)
+ ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset)
+ ## points(data[set,], pch = 1, cex=2, col=1)
+ ## }
+
}
}
+superboxplot <- function( data, ..., significance_level=0.95 )
+{
+ b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1)))
+
+ #print mean and stdev
+ for (i in 1:ncol(data))
+ {
+ med <- sprintf("%.3f",b$stats[,i][3])
+ stdev <- sprintf("%.3f",sqrt(var(data[,i])))
+ mtext(paste(med,"+-",stdev),side=1,at=i,line=2)
+ }
+
+ #print significance tests
+ sig <- array(list(),c(ncol(data),ncol(data)))
+ for (i in 1:(ncol(data)-1))
+ {
+ for (j in (i+1):ncol(data))
+ {
+ ttest = t.test(data[,i],data[,j],paired=T)
+ #print(ttest$p.value)
+ #print(is.na(ttest$p.value))
+ if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
+ {
+ sig[i,j] = T
+ sig[j,i] = T
+ }
+ else
+ {
+ sig[i,j] = F
+ sig[j,i] = F
+ }
+ }
+ }
+
+ for (i in 1:ncol(data))
+ {
+ ## s <- ""
+ ## for (j in 1:ncol(data))
+ ## {
+ ## if (i == j)
+ ## s <- paste( s, "-" ,sep="")
+ ## else if (sig[i,j]==T)
+ ## s <- paste( s, "X" ,sep="")
+ ## else
+ ## s <- paste( s, "0" ,sep="")
+ ## }
+
+ s<-""
+ bigger <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3])
+ bigger <- paste(bigger,j,sep=",")
+ }
+ if (nchar(bigger)>0)
+ {
+ bigger <- substring(bigger, 2)
+ bigger <- paste(">(",bigger,")",sep="")
+ s <- paste(s,bigger,sep=" ")
+ }
+ smaller <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3])
+ smaller <- paste(smaller,j,sep=",")
+ }
+ if (nchar(smaller)>0)
+ {
+ smaller <- substring(smaller, 2)
+ smaller <- paste("<(",smaller,")",sep="")
+ s <- paste(s,smaller,sep=" ")
+ }
+ mtext(s,side=1,at=i,line=3)
+ }
+
+ #print(sig)
+}
+
#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
#data<-rbind(a,b)
#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
#data<-rbind(data,c)
-#data=iris
+#data=iris[1:4]
#pca_reduce_features(data)
#data=ethanol
#split = stratified_k_fold_split(data, num_folds=3)
-#split = anti_stratified_split(data, ratio=0.75)
+#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
-#split = stratified_split(data, ratio=0.9,preprocess = "pca")
+#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
#cluster = cluster(ethanol,3,3)
#print(split)
-#print(sum(split))
-#plot_split(plot_pre_process(data, method="pca"),split,c("training","test"))
+#print(sum(split)
+#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split)
+#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster)