fixing explict-stratified-splitting: take cluster with index 0 into account

author: mguetlein <martin.guetlein@gmail.com> 2012-07-24 19:41:30 +0200
committer: mguetlein <martin.guetlein@gmail.com> 2012-07-24 19:41:30 +0200
commit: 56d3b604b089af91e3e82c8e9ff3e77841d845fb (patch)
tree: d7dc19106aaa1202b3e5351b4dd8039f8038aa74
parent: c4342873cb6c3aeaf4496ac12d05f47347a4f945 (diff)
1 files changed, 120 insertions, 66 deletions
diff --git a/lib/stratification.R b/lib/stratification.R
index f12b302..238515d 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -122,34 +122,37 @@ explicit_stratified_split <- function( class, ratio=0.3 )
 	num_instances = length(class)
 #	print(num_instances)
 
+	min_class=min(class)
+	max_class=max(class)
+	
 	# create a list with indexes of each class
 	class_idx <- list()
-	for (i in 1:max(class))
-		class_idx[[i]] <- vector() 
+	for (i in min_class:max_class)
+		class_idx[[i+1]] <- vector() 
 	for (i in 1:num_instances)
-		class_idx[[class[i]]] <- c(class_idx[[class[i]]],i)
+		class_idx[[class[i]+1]] <- c(class_idx[[class[i]+1]],i)
 	#print(class_idx)
 	# shuffle each index list
-	for (i in 1:max(class))
-		class_idx[[i]] <- sample(class_idx[[i]]) 
+	for (i in min_class:max_class)
+		class_idx[[i+1]] <- sample(class_idx[[i+1]]) 
  #   print(class_idx)
 	
 	selected <- vector()
 	sum_total <- 0
 	
-	classes = sample(1:max(class))
+	classes = sample(min_class:max_class)
 	# iterate of classes in random order
 	for (i in classes)
 	{
 		# count the numbers of selections that should be made
-		sum_total <- sum_total + length(class_idx[[i]])
+		sum_total <- sum_total + length(class_idx[[i+1]])
 		selected_total <- round_it(sum_total*ratio)
 	#	print(selected_total)
 		# count the number of new selections
 		selected_new <- selected_total - length(selected)
 	#	print(selected_new)
 		# add selections
-		selected <- c(selected, class_idx[[i]][1:selected_new])
+		selected <- c(selected, class_idx[[i+1]][1:selected_new])
 	#	print(selected)
 	#	print("")
 	}
@@ -161,6 +164,17 @@ explicit_stratified_split <- function( class, ratio=0.3 )
 	as.vector(split)
 }
 
+random_split <- function( data, ratio=0.3 )
+{
+	num <- round_it(nrow(data)*ratio)
+	selected <- sample(rep(1:nrow(data)))[1:num]
+	# convert selected indexs to split array
+	split <- array(0,nrow(data))
+	for(i in selected)
+		split[i] <- 1
+	as.vector(split)
+}
+
 stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" )
 {
 	data.processed = as.matrix(process_data( data, colnames ))
@@ -447,7 +461,7 @@ plot_pre_process <- function( data, method="pca" )
     stop("unknown method")
 }
 
-plot_split <- function( data, color_idx, circle_idx=NULL, ... )
+plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... )
 {
   if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
     stop("data not suitable for plotting, plot_pre_process() first")
@@ -474,6 +488,11 @@ plot_split <- function( data, color_idx, circle_idx=NULL, ... )
   col_offset = 2
   if (!is.null(circle_idx))
 	  col_offset = 1
+  if(is.null(color_idx))
+  {
+	  color_idx <- array(0,nrow(data))
+	  col_offset=3
+  }
   for (j in 0:max(color_idx))
   {
 	  set = c()
@@ -518,68 +537,70 @@ superboxplot  <- function( data, ..., significance_level=0.95 )
 	}
 	
 	#print significance tests
-	sig <- array(list(),c(ncol(data),ncol(data)))
-	for (i in 1:(ncol(data)-1))
+	if (nrow(data)>10)
 	{
-		for (j in (i+1):ncol(data))
+		sig <- array(list(),c(ncol(data),ncol(data)))
+		for (i in 1:(ncol(data)-1))
 		{
-			ttest = t.test(data[,i],data[,j],paired=T)
-			#print(ttest$p.value)
-			#print(is.na(ttest$p.value))
-			if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
-			{
-				sig[i,j] = T
-				sig[j,i] = T
-			}
-			else
+			for (j in (i+1):ncol(data))
 			{
-				sig[i,j] = F
-				sig[j,i] = F
+				ttest = t.test(data[,i],data[,j],paired=T)
+				#print(ttest$p.value)
+				#print(is.na(ttest$p.value))
+				if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
+				{
+					sig[i,j] = T
+					sig[j,i] = T
+				}
+				else
+				{
+					sig[i,j] = F
+					sig[j,i] = F
+				}
 			}
 		}
-	}
-	
-	for (i in 1:ncol(data))
-	{
-		## s <- ""
-		## for (j in 1:ncol(data))
-		## {
-		##     if (i == j)
-		##         s <- paste( s, "-" ,sep="")
-		##     else if (sig[i,j]==T)
-		##         s <- paste( s, "X" ,sep="")
-		##     else
-		##         s <- paste( s, "0" ,sep="")
-		## }
-		
-		s<-""
-		bigger <- ""
-		for (j in 1:ncol(data))
+		for (i in 1:ncol(data))
 		{
-			#print(paste(i,j))
-			if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) 
-				bigger <- paste(bigger,j,sep=",")
-		}
-		if (nchar(bigger)>0)
-		{
-			bigger <- substring(bigger, 2)
-			bigger <- paste(">(",bigger,")",sep="")
-			s <- paste(s,bigger,sep=" ")
-		}
-		smaller <- ""
-		for (j in 1:ncol(data))
-		{
-			#print(paste(i,j))
-			if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) 
-				smaller <- paste(smaller,j,sep=",")
-		}
-		if (nchar(smaller)>0)
-		{
-			smaller <- substring(smaller, 2)
-			smaller <- paste("<(",smaller,")",sep="")
-			s <- paste(s,smaller,sep=" ")
+			## s <- ""
+			## for (j in 1:ncol(data))
+			## {
+			##     if (i == j)
+			##         s <- paste( s, "-" ,sep="")
+			##     else if (sig[i,j]==T)
+			##         s <- paste( s, "X" ,sep="")
+			##     else
+			##         s <- paste( s, "0" ,sep="")
+			## }
+			
+			s<-""
+			bigger <- ""
+			for (j in 1:ncol(data))
+			{
+				#print(paste(i,j))
+				if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) 
+					bigger <- paste(bigger,j,sep=",")
+			}
+			if (nchar(bigger)>0)
+			{
+				bigger <- substring(bigger, 2)
+				bigger <- paste(">(",bigger,")",sep="")
+				s <- paste(s,bigger,sep=" ")
+			}
+			smaller <- ""
+			for (j in 1:ncol(data))
+			{
+				#print(paste(i,j))
+				if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) 
+					smaller <- paste(smaller,j,sep=",")
+			}
+			if (nchar(smaller)>0)
+			{
+				smaller <- substring(smaller, 2)
+				smaller <- paste("<(",smaller,")",sep="")
+				s <- paste(s,smaller,sep=" ")
+			}
+			mtext(s,side=1,at=i,line=3)
 		}
-		mtext(s,side=1,at=i,line=3)
 	}
 	
 	#print(sig)
@@ -591,17 +612,25 @@ superboxplot  <- function( data, ..., significance_level=0.95 )
 #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
 #data<-rbind(data,c)
 #data=iris[1:4]
+
+#require("datasets")
+#data=ChickWeight
+#data=freeny
+#data=sunspots
+#data=faithful
 #pca_reduce_features(data)
 #data=ethanol
 #split = stratified_k_fold_split(data, num_folds=3)
 #split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
 
 #split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
+#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical")
 
 #cluster = cluster(ethanol,3,3)
 
 #print(split)
 #print(sum(split)
+
 #plot_split(plot_pre_process(data, method="pca"),color_idx=split$split)
 #plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster)
 
@@ -610,5 +639,30 @@ superboxplot  <- function( data, ..., significance_level=0.95 )
 #cl = cluster(data)
 
 
-
-
+## # stratified splitting plot
+## 
+## #png("data.png",width=1024,height=768,pointsize=20)
+## png("random.png",width=1024,height=768,pointsize=20)
+## #png("stratfied_cluster.png",width=1024,height=768,pointsize=20)
+## #png("stratfied_cluster_selected.png",width=1024,height=768,pointsize=20)
+## #png("stratfied_selected.png",width=1024,height=768,pointsize=20)
+## 
+## #seed = sample(1:1000000,1)
+## #print(paste("seed ",seed))
+## #set.seed(seed)
+## 
+## set.seed(86281) #unlucky random
+## #set.seed(160719) #nice stratifed
+## 
+## data=df_11306
+## data=cbind(data[1],data[3])
+## plot_split(data,xlab="Molecular weight",ylab="LogP")
+## 
+## #split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
+## split = list(split=random_split(data, ratio=0.1))
+## 
+## plot_split(data,color_idx=split$split,xlab="Molecular weight",ylab="LogP")
+## #plot_split(data,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP")
+## #plot_split(data,circle_idx=split$split,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP")
+## 
+## dev.off()
+\ No newline at end of file
author	mguetlein <martin.guetlein@gmail.com>	2012-07-24 19:41:30 +0200
committer	mguetlein <martin.guetlein@gmail.com>	2012-07-24 19:41:30 +0200
commit	56d3b604b089af91e3e82c8e9ff3e77841d845fb (patch)
tree	d7dc19106aaa1202b3e5351b4dd8039f8038aa74
parent	c4342873cb6c3aeaf4496ac12d05f47347a4f945 (diff)