diff options
Diffstat (limited to 'lib/stratification.R')
-rw-r--r-- | lib/stratification.R | 230 |
1 files changed, 205 insertions, 25 deletions
diff --git a/lib/stratification.R b/lib/stratification.R index 8a838fa..f12b302 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -113,10 +113,59 @@ cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15 cut } -stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NULL, preprocess="none" ) +# input: classes [ 1 2 4 3 1 2 4 3 2 1 2 3 3 2 ] +# input : ratio 0.1 +# output: split [ 1 0 0 1 0 0 0 0 0 1 0 0 0 0 ] +explicit_stratified_split <- function( class, ratio=0.3 ) +{ +# print(class) + num_instances = length(class) +# print(num_instances) + + # create a list with indexes of each class + class_idx <- list() + for (i in 1:max(class)) + class_idx[[i]] <- vector() + for (i in 1:num_instances) + class_idx[[class[i]]] <- c(class_idx[[class[i]]],i) + #print(class_idx) + # shuffle each index list + for (i in 1:max(class)) + class_idx[[i]] <- sample(class_idx[[i]]) + # print(class_idx) + + selected <- vector() + sum_total <- 0 + + classes = sample(1:max(class)) + # iterate of classes in random order + for (i in classes) + { + # count the numbers of selections that should be made + sum_total <- sum_total + length(class_idx[[i]]) + selected_total <- round_it(sum_total*ratio) + # print(selected_total) + # count the number of new selections + selected_new <- selected_total - length(selected) + # print(selected_new) + # add selections + selected <- c(selected, class_idx[[i]][1:selected_new]) + # print(selected) + # print("") + } + + # convert selected indexs to split array + split <- array(0,num_instances) + for(i in selected) + split[i] <- 1 + as.vector(split) +} + +stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" ) { data.processed = as.matrix(process_data( data, colnames )) print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess)) + cl = NULL if (preprocess == "none") { @@ -137,11 +186,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed) pik = rep(ratio,times=nrow(data.processed)) data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) + split = samplecube(data.strat,pik,order=2,comment=F) } else if (method == "cluster_knn") { - cl = cluster_knn(data.processed) + cl = as.vector(cluster_knn(data.processed)) # require("caret") # res = createDataPartition(cl,p=ratio) # split = rep(1, times=nrow(data)) @@ -149,12 +198,26 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU # if ( is.na(match(j,res$Resample1)) ) # split[j]=0 # split - stratified_split(cl,ratio,"samplecube") + + split = stratified_split(cl,ratio,"samplecube")$split } else if (method == "cluster_hierarchical") { - cl = cluster_hierarchical(data.processed) - stratified_split(cl,ratio,"samplecube") + cl = as.vector(cluster_hierarchical(data.processed)) + + #require("caret") + #res = createDataPartition(cl,p=ratio) + #split = rep(1, times=nrow(data)) + #for (j in 1:nrow(data)) + #if ( is.na(match(j,res$Resample1)) ) + # split[j]=0 + + if (method_2 == "samplecube") + split = stratified_split(cl,ratio,"samplecube")$split + else if (method_2 == "explicit") + split = explicit_stratified_split(cl,ratio) + else + stop("unknown method") } ## else if (method == "cluster2") ## { @@ -199,6 +262,8 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU ## } else stop("unknown method") + + list(split=split, cluster=cl) } anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) @@ -213,7 +278,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) data.processed = as.matrix(process_data( data, colnames )) print(paste("anti-split using #features: ",ncol(data.processed))) num_c = floor(1/ratio) - cl = cluster(data.processed, num_c, num_c) + cl = cluster_knn(data.processed, num_c, num_c) #print(cl) idx = -1 min = 1000000 @@ -244,7 +309,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) for(j in 1:nrow(data)) split[j] = 1-split[j] #print(split) - as.vector(split) + list(split=as.vector(split),cluster=cl) } stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL ) @@ -382,7 +447,7 @@ plot_pre_process <- function( data, method="pca" ) stop("unknown method") } -plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... ) +plot_split <- function( data, color_idx, circle_idx=NULL, ... ) { if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2])) stop("data not suitable for plotting, plot_pre_process() first") @@ -390,40 +455,155 @@ plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... ) plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... ) if (is.null(names)) names <- c("split 1","split 2") - colos = as.double(rep(2:(max(split)+2))) - legend("topleft",names,pch=2,col=colos) + #colos = as.double(rep(2:(max(color_idx)+2))) + #legend("topleft",names,pch=2,col=colos) + + #legend("topleft",names[1],pch=as.double(c(1)),col="red") - for (j in max(color_idx):0) + ## for (j in max(color_idx):0) + ## { + ## for (k in max(shape_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && shape_idx[i]==k) + ## set = c(set,i) + ## points(data[set,], pch = 15+k, col=(j+2)) + ## } + ## } + col_offset = 2 + if (!is.null(circle_idx)) + col_offset = 1 + for (j in 0:max(color_idx)) { - for (k in max(shape_idx):0) - { - set = c() - for (i in 1:nrow(data)) - if (color_idx[i]==j && shape_idx[i]==k) - set = c(set,i) - points(data[set,], pch = 15+k, col=(j+2)) - } + set = c() + for (i in 1:nrow(data)) + if (color_idx[i]==j) + set = c(set,i) + points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) + } + if (!is.null(circle_idx)) + { + set = c() + for (i in 1:nrow(data)) + if (circle_idx[i]==1) + set = c(set,i)# + points(data[set,], pch = 1, cex=1, col=1) + points(data[set,], pch = 1, cex=1.8, col=1) + + ## for (j in max(color_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && circle_idx[i]==1) + ## set = c(set,i) + ## points(data[set,], pch = 19, cex=1, col=1) + ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset) + ## points(data[set,], pch = 1, cex=2, col=1) + ## } + } } +superboxplot <- function( data, ..., significance_level=0.95 ) +{ + b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1))) + + #print mean and stdev + for (i in 1:ncol(data)) + { + med <- sprintf("%.3f",b$stats[,i][3]) + stdev <- sprintf("%.3f",sqrt(var(data[,i]))) + mtext(paste(med,"+-",stdev),side=1,at=i,line=2) + } + + #print significance tests + sig <- array(list(),c(ncol(data),ncol(data))) + for (i in 1:(ncol(data)-1)) + { + for (j in (i+1):ncol(data)) + { + ttest = t.test(data[,i],data[,j],paired=T) + #print(ttest$p.value) + #print(is.na(ttest$p.value)) + if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value) + { + sig[i,j] = T + sig[j,i] = T + } + else + { + sig[i,j] = F + sig[j,i] = F + } + } + } + + for (i in 1:ncol(data)) + { + ## s <- "" + ## for (j in 1:ncol(data)) + ## { + ## if (i == j) + ## s <- paste( s, "-" ,sep="") + ## else if (sig[i,j]==T) + ## s <- paste( s, "X" ,sep="") + ## else + ## s <- paste( s, "0" ,sep="") + ## } + + s<-"" + bigger <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) + bigger <- paste(bigger,j,sep=",") + } + if (nchar(bigger)>0) + { + bigger <- substring(bigger, 2) + bigger <- paste(">(",bigger,")",sep="") + s <- paste(s,bigger,sep=" ") + } + smaller <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) + smaller <- paste(smaller,j,sep=",") + } + if (nchar(smaller)>0) + { + smaller <- substring(smaller, 2) + smaller <- paste("<(",smaller,")",sep="") + s <- paste(s,smaller,sep=" ") + } + mtext(s,side=1,at=i,line=3) + } + + #print(sig) +} + #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) #data<-rbind(a,b) #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) #data<-rbind(data,c) -#data=iris +#data=iris[1:4] #pca_reduce_features(data) #data=ethanol #split = stratified_k_fold_split(data, num_folds=3) -#split = anti_stratified_split(data, ratio=0.75) +#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical") -#split = stratified_split(data, ratio=0.9,preprocess = "pca") +#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit") #cluster = cluster(ethanol,3,3) #print(split) -#print(sum(split)) -#plot_split(plot_pre_process(data, method="pca"),split,c("training","test")) +#print(sum(split) +#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split) +#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster) |