extend/fix(?) stratified splitting, more r-util stuff

author: mguetlein <martin.guetlein@gmail.com> 2012-07-23 22:28:08 +0200
committer: mguetlein <martin.guetlein@gmail.com> 2012-07-23 22:28:08 +0200
commit: c4342873cb6c3aeaf4496ac12d05f47347a4f945 (patch)
tree: daf612ee0722470336ec24ed60d1e01f6a393b4d
parent: d8f8050ba8bf47c5a272bcb937b4ee4f18ac5da0 (diff)
2 files changed, 291 insertions, 40 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb
index bbf98f3..952b873 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -58,10 +58,40 @@ module OpenTox
       end
     end
     
+    
+    def ttest_matrix(arrays, paired, significance_level=0.95)
+    
+      LOGGER.info("perform ttest matrix")
+      result = []
+      arrays.size.times do |i|
+        result[i] = [] 
+        @r.assign "v#{i}",arrays[i]
+      end 
+      (arrays.size-1).times do |i|
+        (i+1..arrays.size-1).each do |j|
+          raise if paired && arrays[i].size!=arrays[j].size
+          @r.eval "ttest = t.test(as.numeric(v#{i}),as.numeric(v#{j}),paired=#{paired ? "T" : "F"})"
+          t = @r.pull "ttest$statistic"
+          p = @r.pull "ttest$p.value"
+          if (1-significance_level > p)
+            result[i][j]=true
+            result[j][i]=true
+          else
+            result[i][j]=false
+            result[j][i]=false
+          end            
+        end
+      end
+      result
+    end
+    
+    
+    
     # <0 -> array1 << array2
     # 0  -> no significant difference
     # >0 -> array2 >> array1
     def ttest(array1, array2, paired, significance_level=0.95)
+      LOGGER.info("perform ttest")
       @r.assign "v1",array1
       @r.assign "v2",array2
       raise if paired && array1.size!=array2.size
@@ -106,6 +136,7 @@ module OpenTox
     def boxplot(files, data, title="", hline=nil, param="")
       LOGGER.debug("r-util> create boxplot "+data.inspect)
       raise "no hashes, to keep order" if data.is_a?(Hash)
+      raise "boxplot: data is empty" if data.size==0
       max = -1
       min = 100000
       max_median = -1
@@ -158,7 +189,7 @@ module OpenTox
     # embedds feature values of two datasets into 2D and plots it
     #        
     def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
-        features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil)
+        features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil )
         
       LOGGER.debug("r-util> create feature value plot")
       d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
@@ -178,23 +209,42 @@ module OpenTox
       waiting_task.progress(50) if waiting_task
       
       @r.eval "df <- rbind(#{df1},#{df2})"
-      @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+      @r.eval "split <- c(rep(1,nrow(#{df1})),rep(0,nrow(#{df2})))"
       @r.names = [dataset_name1, dataset_name2]
       LOGGER.debug("r-util> - convert data to 2d")
       #@r.eval "save.image(\"/tmp/image.R\")"
       
+      if (color_feature)
+        color = []
+        [d1,d2].each do |d|
+          raise "no #{color_feature}, instead: #{d.features.keys.sort.inspect}" unless d.features.has_key?(color_feature)
+          d.compounds.each do |c|
+            color += d.data_entries[c][color_feature]
+          end
+        end
+        @r.assign "color",color
+      end
+      
       if (direct_plot)
         raise unless features.size==2
         @r.eval "df.2d <- df"
+        x = features[0].split("/")[-1]
+        y = features[1].split("/")[-1]
       else
         @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
+        x = "x"
+        y = "y"
       end
       waiting_task.progress(75) if waiting_task
       
       title = "Sammon embedding of #{features.size} features" unless title
       LOGGER.debug("r-util> - plot data")
       plot_to_files(files) do |file|
-        @r.eval "plot_split( df.2d, split, names, main='#{title}',xlab='x',ylab='y')"
+        if (color_feature)
+          @r.eval "plot_split( df.2d, color_idx=color, circle_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')"
+        else
+          @r.eval "plot_split( df.2d, color_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')"
+        end
       end
     end
     
@@ -255,8 +305,10 @@ module OpenTox
     # stratified splits a dataset into two dataset according to the feature values
     # all features are taken into account unless <split_features> is given
     # returns two datases
-    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
-      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification )
+    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, 
+      seed=42, split_features=nil, anti_stratification=false, store_split_clusters=false )
+      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, 
+        seed, split_features, anti_stratification, store_split_clusters )
     end
     
     # stratified splits a dataset into k datasets according the feature values
@@ -267,7 +319,8 @@ module OpenTox
     end    
     
     private
-    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super" )
+    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, 
+        pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super", store_split_clusters=false )
       raise "internal error" if num_folds!=nil and pct!=nil
       k_fold_split = num_folds!=nil
       if k_fold_split
@@ -303,9 +356,10 @@ module OpenTox
         end
         return train, test
       else
-        raise unless stratification=~/^(super|super4|anti)$/
+        raise unless stratification=~/^(super|super4|super5|anti)$/
         anti = ""
         super_method = ""
+        super_method_2 = ""
         preprocess = ""
         case stratification
         when "anti"
@@ -315,14 +369,19 @@ module OpenTox
         when "super4"
           super_method = ", method='cluster_hierarchical'"
           preprocess = ", preprocess='pca'"
+        when "super5"
+          super_method = ", method='cluster_hierarchical'"
+          super_method_2 = ", method_2='explicit'"
+          preprocess = ", preprocess='pca'"
         end
-        puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})"
-        @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})"
-        split = @r.pull 'split'
+        LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
+        @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
+        split = @r.pull 'split$split'
+        cluster = (store_split_clusters ?  @r.pull('split$cluster') : nil)
         metadata[DC.title] = "Training dataset split of "+dataset.uri
-        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+        train = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==1 }
         metadata[DC.title] = "Test dataset split of "+dataset.uri
-        test = split_to_dataset( df, split, metadata, subjectid, missing_values ){ |i| i==0 }
+        test = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==0 }
         return train, test
       end
     end
@@ -436,7 +495,7 @@ module OpenTox
     NEW = false
     
     private
-    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA" )
+    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA", cluster=nil )
       raise unless @@feats[df].size>0
 
       missing_value_regexp = Regexp.new("^#{missing_values.to_s=="0" ? "(0.0|0)" : missing_values.to_s}$") unless NEW
@@ -469,14 +528,26 @@ module OpenTox
           end 
         end
       end
+      if cluster
+        cluster_feature = "http://no.such.domain/feature/split_cluster"
+        dataset.add_feature(cluster_feature)
+        #LOGGER.warn "adding feature #{cluster_feature}" 
+        compounds.size.times do |r|
+          if compound_indices==nil or compound_indices.include?(r)
+            dataset.add(compounds[r],cluster_feature,cluster[r],true)
+          end 
+        end
+      else
+        #LOGGER.warn "no cluster feature" 
+      end
       dataset.save(subjectid)
       dataset
     end    
     
-    def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA" )
+    def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA", cluster=nil )
       indices = []
       split.size.times{|i| indices<<i if yield(split[i]) }
-      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values )
+      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values, cluster )
       LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
       dataset
     end
diff --git a/lib/stratification.R b/lib/stratification.R
index 8a838fa..f12b302 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -113,10 +113,59 @@ cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15
 	cut
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NULL, preprocess="none" )
+# input: classes [ 1 2 4 3 1 2 4 3 2 1 2 3 3 2 ]
+# input : ratio 0.1
+# output: split  [ 1 0 0 1 0 0 0 0 0 1 0 0 0 0 ]
+explicit_stratified_split <- function( class, ratio=0.3 )
+{
+#	print(class)
+	num_instances = length(class)
+#	print(num_instances)
+
+	# create a list with indexes of each class
+	class_idx <- list()
+	for (i in 1:max(class))
+		class_idx[[i]] <- vector() 
+	for (i in 1:num_instances)
+		class_idx[[class[i]]] <- c(class_idx[[class[i]]],i)
+	#print(class_idx)
+	# shuffle each index list
+	for (i in 1:max(class))
+		class_idx[[i]] <- sample(class_idx[[i]]) 
+ #   print(class_idx)
+	
+	selected <- vector()
+	sum_total <- 0
+	
+	classes = sample(1:max(class))
+	# iterate of classes in random order
+	for (i in classes)
+	{
+		# count the numbers of selections that should be made
+		sum_total <- sum_total + length(class_idx[[i]])
+		selected_total <- round_it(sum_total*ratio)
+	#	print(selected_total)
+		# count the number of new selections
+		selected_new <- selected_total - length(selected)
+	#	print(selected_new)
+		# add selections
+		selected <- c(selected, class_idx[[i]][1:selected_new])
+	#	print(selected)
+	#	print("")
+	}
+	
+	# convert selected indexs to split array
+	split <- array(0,num_instances)
+	for(i in selected)
+		split[i] <- 1
+	as.vector(split)
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" )
 {
 	data.processed = as.matrix(process_data( data, colnames ))
 	print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess))
+	cl = NULL
 	
 	if (preprocess == "none")
 	{
@@ -137,11 +186,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
       ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
       pik = rep(ratio,times=nrow(data.processed))
       data.strat = cbind(pik,data.processed)
-      samplecube(data.strat,pik,order=2,comment=F)
+      split = samplecube(data.strat,pik,order=2,comment=F)
     }
     else if (method == "cluster_knn")
     {
-      cl = cluster_knn(data.processed)
+      cl = as.vector(cluster_knn(data.processed))
 #      require("caret")
 #      res = createDataPartition(cl,p=ratio)
 #      split = rep(1, times=nrow(data))
@@ -149,12 +198,26 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
 #        if ( is.na(match(j,res$Resample1)) )
 #          split[j]=0
 #      split
-      stratified_split(cl,ratio,"samplecube")
+	   
+       split = stratified_split(cl,ratio,"samplecube")$split
     }
 	else if (method == "cluster_hierarchical") 
 	{
-		cl = cluster_hierarchical(data.processed)
-		stratified_split(cl,ratio,"samplecube")
+		cl = as.vector(cluster_hierarchical(data.processed))
+		
+	    #require("caret")
+        #res = createDataPartition(cl,p=ratio)
+        #split = rep(1, times=nrow(data))
+        #for (j in 1:nrow(data))
+        #if ( is.na(match(j,res$Resample1)) )
+        #  split[j]=0
+		
+	   if (method_2 == "samplecube")
+	     split = stratified_split(cl,ratio,"samplecube")$split
+	   else if (method_2 == "explicit")
+	     split = explicit_stratified_split(cl,ratio)
+	   else
+		   stop("unknown method")
 	}
 	## else if (method == "cluster2")
 	## {
@@ -199,6 +262,8 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
 	## }
     else
       stop("unknown method")
+  
+    list(split=split, cluster=cl)
 }
 
 anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
@@ -213,7 +278,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
   data.processed = as.matrix(process_data( data, colnames ))
   print(paste("anti-split using #features: ",ncol(data.processed)))
   num_c = floor(1/ratio)
-  cl = cluster(data.processed, num_c, num_c)
+  cl = cluster_knn(data.processed, num_c, num_c)
   #print(cl)
   idx = -1
   min = 1000000
@@ -244,7 +309,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
     for(j in 1:nrow(data))
       split[j] = 1-split[j]
   #print(split)
-  as.vector(split)
+  list(split=as.vector(split),cluster=cl)
 }
 
 stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
@@ -382,7 +447,7 @@ plot_pre_process <- function( data, method="pca" )
     stop("unknown method")
 }
 
-plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
+plot_split <- function( data, color_idx, circle_idx=NULL, ... )
 {
   if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
     stop("data not suitable for plotting, plot_pre_process() first")
@@ -390,40 +455,155 @@ plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
   plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
   if (is.null(names))
     names <- c("split 1","split 2")
-  colos = as.double(rep(2:(max(split)+2)))
-  legend("topleft",names,pch=2,col=colos)
+  #colos = as.double(rep(2:(max(color_idx)+2)))
+  #legend("topleft",names,pch=2,col=colos)
+  
+  #legend("topleft",names[1],pch=as.double(c(1)),col="red")
 
-  for (j in max(color_idx):0)
+  ## for (j in max(color_idx):0)
+  ## {
+  ##   for (k in max(shape_idx):0)
+  ##   {
+  ##     set = c()
+  ##     for (i in 1:nrow(data))
+  ##       if (color_idx[i]==j && shape_idx[i]==k)
+  ##         set = c(set,i)
+  ##     points(data[set,], pch = 15+k, col=(j+2))
+  ##   }
+  ## }
+  col_offset = 2
+  if (!is.null(circle_idx))
+	  col_offset = 1
+  for (j in 0:max(color_idx))
   {
-    for (k in max(shape_idx):0)
-    {
-      set = c()
-      for (i in 1:nrow(data))
-        if (color_idx[i]==j && shape_idx[i]==k)
-          set = c(set,i)
-      points(data[set,], pch = 15+k, col=(j+2))
-    }
+	  set = c()
+	  for (i in 1:nrow(data))
+		  if (color_idx[i]==j)
+			  set = c(set,i)
+	  points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+  }
+  if (!is.null(circle_idx))
+  {
+	  set = c()
+	  for (i in 1:nrow(data))
+		  if (circle_idx[i]==1)
+				set = c(set,i)#
+	  points(data[set,], pch = 1, cex=1, col=1)
+	  points(data[set,], pch = 1, cex=1.8, col=1)
+	  
+	  ## for (j in max(color_idx):0)
+	  ## {
+	  ##     set = c()
+	  ##     for (i in 1:nrow(data))
+	  ##         if (color_idx[i]==j && circle_idx[i]==1)
+	  ##             set = c(set,i)
+	  ##     points(data[set,], pch = 19, cex=1, col=1)
+	  ##     points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset)
+	  ##     points(data[set,], pch = 1, cex=2, col=1)
+	  ## }
+	  
   }
 }
 
+superboxplot  <- function( data, ..., significance_level=0.95 )
+{
+	b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1)))
+	
+	#print mean and stdev
+	for (i in 1:ncol(data))
+	{
+		med <- sprintf("%.3f",b$stats[,i][3])
+		stdev <- sprintf("%.3f",sqrt(var(data[,i])))
+		mtext(paste(med,"+-",stdev),side=1,at=i,line=2)
+	}
+	
+	#print significance tests
+	sig <- array(list(),c(ncol(data),ncol(data)))
+	for (i in 1:(ncol(data)-1))
+	{
+		for (j in (i+1):ncol(data))
+		{
+			ttest = t.test(data[,i],data[,j],paired=T)
+			#print(ttest$p.value)
+			#print(is.na(ttest$p.value))
+			if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
+			{
+				sig[i,j] = T
+				sig[j,i] = T
+			}
+			else
+			{
+				sig[i,j] = F
+				sig[j,i] = F
+			}
+		}
+	}
+	
+	for (i in 1:ncol(data))
+	{
+		## s <- ""
+		## for (j in 1:ncol(data))
+		## {
+		##     if (i == j)
+		##         s <- paste( s, "-" ,sep="")
+		##     else if (sig[i,j]==T)
+		##         s <- paste( s, "X" ,sep="")
+		##     else
+		##         s <- paste( s, "0" ,sep="")
+		## }
+		
+		s<-""
+		bigger <- ""
+		for (j in 1:ncol(data))
+		{
+			#print(paste(i,j))
+			if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) 
+				bigger <- paste(bigger,j,sep=",")
+		}
+		if (nchar(bigger)>0)
+		{
+			bigger <- substring(bigger, 2)
+			bigger <- paste(">(",bigger,")",sep="")
+			s <- paste(s,bigger,sep=" ")
+		}
+		smaller <- ""
+		for (j in 1:ncol(data))
+		{
+			#print(paste(i,j))
+			if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) 
+				smaller <- paste(smaller,j,sep=",")
+		}
+		if (nchar(smaller)>0)
+		{
+			smaller <- substring(smaller, 2)
+			smaller <- paste("<(",smaller,")",sep="")
+			s <- paste(s,smaller,sep=" ")
+		}
+		mtext(s,side=1,at=i,line=3)
+	}
+	
+	#print(sig)
+}
+
 #a<-matrix(rnorm(100, mean=50,  sd=4), ncol=5)
 #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
 #data<-rbind(a,b)
 #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
 #data<-rbind(data,c)
-#data=iris
+#data=iris[1:4]
 #pca_reduce_features(data)
 #data=ethanol
 #split = stratified_k_fold_split(data, num_folds=3)
-#split = anti_stratified_split(data, ratio=0.75)
+#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
 
-#split = stratified_split(data, ratio=0.9,preprocess = "pca")
+#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
 
 #cluster = cluster(ethanol,3,3)
 
 #print(split)
-#print(sum(split))
-#plot_split(plot_pre_process(data, method="pca"),split,c("training","test"))
+#print(sum(split)
+#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split)
+#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster)
author	mguetlein <martin.guetlein@gmail.com>	2012-07-23 22:28:08 +0200
committer	mguetlein <martin.guetlein@gmail.com>	2012-07-23 22:28:08 +0200
commit	c4342873cb6c3aeaf4496ac12d05f47347a4f945 (patch)
tree	daf612ee0722470336ec24ed60d1e01f6a393b4d
parent	d8f8050ba8bf47c5a272bcb937b4ee4f18ac5da0 (diff)