summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormguetlein <martin.guetlein@gmail.com>2012-07-23 22:28:08 +0200
committermguetlein <martin.guetlein@gmail.com>2012-07-23 22:28:08 +0200
commitc4342873cb6c3aeaf4496ac12d05f47347a4f945 (patch)
treedaf612ee0722470336ec24ed60d1e01f6a393b4d
parentd8f8050ba8bf47c5a272bcb937b4ee4f18ac5da0 (diff)
extend/fix(?) stratified splitting, more r-util stuff
-rw-r--r--lib/r-util.rb101
-rw-r--r--lib/stratification.R230
2 files changed, 291 insertions, 40 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb
index bbf98f3..952b873 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -58,10 +58,40 @@ module OpenTox
end
end
+
+ def ttest_matrix(arrays, paired, significance_level=0.95)
+
+ LOGGER.info("perform ttest matrix")
+ result = []
+ arrays.size.times do |i|
+ result[i] = []
+ @r.assign "v#{i}",arrays[i]
+ end
+ (arrays.size-1).times do |i|
+ (i+1..arrays.size-1).each do |j|
+ raise if paired && arrays[i].size!=arrays[j].size
+ @r.eval "ttest = t.test(as.numeric(v#{i}),as.numeric(v#{j}),paired=#{paired ? "T" : "F"})"
+ t = @r.pull "ttest$statistic"
+ p = @r.pull "ttest$p.value"
+ if (1-significance_level > p)
+ result[i][j]=true
+ result[j][i]=true
+ else
+ result[i][j]=false
+ result[j][i]=false
+ end
+ end
+ end
+ result
+ end
+
+
+
# <0 -> array1 << array2
# 0 -> no significant difference
# >0 -> array2 >> array1
def ttest(array1, array2, paired, significance_level=0.95)
+ LOGGER.info("perform ttest")
@r.assign "v1",array1
@r.assign "v2",array2
raise if paired && array1.size!=array2.size
@@ -106,6 +136,7 @@ module OpenTox
def boxplot(files, data, title="", hline=nil, param="")
LOGGER.debug("r-util> create boxplot "+data.inspect)
raise "no hashes, to keep order" if data.is_a?(Hash)
+ raise "boxplot: data is empty" if data.size==0
max = -1
min = 100000
max_median = -1
@@ -158,7 +189,7 @@ module OpenTox
# embedds feature values of two datasets into 2D and plots it
#
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
- features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil)
+ features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil )
LOGGER.debug("r-util> create feature value plot")
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
@@ -178,23 +209,42 @@ module OpenTox
waiting_task.progress(50) if waiting_task
@r.eval "df <- rbind(#{df1},#{df2})"
- @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+ @r.eval "split <- c(rep(1,nrow(#{df1})),rep(0,nrow(#{df2})))"
@r.names = [dataset_name1, dataset_name2]
LOGGER.debug("r-util> - convert data to 2d")
#@r.eval "save.image(\"/tmp/image.R\")"
+ if (color_feature)
+ color = []
+ [d1,d2].each do |d|
+ raise "no #{color_feature}, instead: #{d.features.keys.sort.inspect}" unless d.features.has_key?(color_feature)
+ d.compounds.each do |c|
+ color += d.data_entries[c][color_feature]
+ end
+ end
+ @r.assign "color",color
+ end
+
if (direct_plot)
raise unless features.size==2
@r.eval "df.2d <- df"
+ x = features[0].split("/")[-1]
+ y = features[1].split("/")[-1]
else
@r.eval "df.2d <- plot_pre_process(df, method='sammon')"
+ x = "x"
+ y = "y"
end
waiting_task.progress(75) if waiting_task
title = "Sammon embedding of #{features.size} features" unless title
LOGGER.debug("r-util> - plot data")
plot_to_files(files) do |file|
- @r.eval "plot_split( df.2d, split, names, main='#{title}',xlab='x',ylab='y')"
+ if (color_feature)
+ @r.eval "plot_split( df.2d, color_idx=color, circle_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')"
+ else
+ @r.eval "plot_split( df.2d, color_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')"
+ end
end
end
@@ -255,8 +305,10 @@ module OpenTox
# stratified splits a dataset into two dataset according to the feature values
# all features are taken into account unless <split_features> is given
# returns two datases
- def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
- stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification )
+ def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil,
+ seed=42, split_features=nil, anti_stratification=false, store_split_clusters=false )
+ stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid,
+ seed, split_features, anti_stratification, store_split_clusters )
end
# stratified splits a dataset into k datasets according the feature values
@@ -267,7 +319,8 @@ module OpenTox
end
private
- def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super" )
+ def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil,
+ pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super", store_split_clusters=false )
raise "internal error" if num_folds!=nil and pct!=nil
k_fold_split = num_folds!=nil
if k_fold_split
@@ -303,9 +356,10 @@ module OpenTox
end
return train, test
else
- raise unless stratification=~/^(super|super4|anti)$/
+ raise unless stratification=~/^(super|super4|super5|anti)$/
anti = ""
super_method = ""
+ super_method_2 = ""
preprocess = ""
case stratification
when "anti"
@@ -315,14 +369,19 @@ module OpenTox
when "super4"
super_method = ", method='cluster_hierarchical'"
preprocess = ", preprocess='pca'"
+ when "super5"
+ super_method = ", method='cluster_hierarchical'"
+ super_method_2 = ", method_2='explicit'"
+ preprocess = ", preprocess='pca'"
end
- puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})"
- @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})"
- split = @r.pull 'split'
+ LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
+ @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
+ split = @r.pull 'split$split'
+ cluster = (store_split_clusters ? @r.pull('split$cluster') : nil)
metadata[DC.title] = "Training dataset split of "+dataset.uri
- train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+ train = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==1 }
metadata[DC.title] = "Test dataset split of "+dataset.uri
- test = split_to_dataset( df, split, metadata, subjectid, missing_values ){ |i| i==0 }
+ test = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==0 }
return train, test
end
end
@@ -436,7 +495,7 @@ module OpenTox
NEW = false
private
- def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA" )
+ def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA", cluster=nil )
raise unless @@feats[df].size>0
missing_value_regexp = Regexp.new("^#{missing_values.to_s=="0" ? "(0.0|0)" : missing_values.to_s}$") unless NEW
@@ -469,14 +528,26 @@ module OpenTox
end
end
end
+ if cluster
+ cluster_feature = "http://no.such.domain/feature/split_cluster"
+ dataset.add_feature(cluster_feature)
+ #LOGGER.warn "adding feature #{cluster_feature}"
+ compounds.size.times do |r|
+ if compound_indices==nil or compound_indices.include?(r)
+ dataset.add(compounds[r],cluster_feature,cluster[r],true)
+ end
+ end
+ else
+ #LOGGER.warn "no cluster feature"
+ end
dataset.save(subjectid)
dataset
end
- def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA" )
+ def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA", cluster=nil )
indices = []
split.size.times{|i| indices<<i if yield(split[i]) }
- dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values )
+ dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values, cluster )
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
dataset
end
diff --git a/lib/stratification.R b/lib/stratification.R
index 8a838fa..f12b302 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -113,10 +113,59 @@ cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15
cut
}
-stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NULL, preprocess="none" )
+# input: classes [ 1 2 4 3 1 2 4 3 2 1 2 3 3 2 ]
+# input : ratio 0.1
+# output: split [ 1 0 0 1 0 0 0 0 0 1 0 0 0 0 ]
+explicit_stratified_split <- function( class, ratio=0.3 )
+{
+# print(class)
+ num_instances = length(class)
+# print(num_instances)
+
+ # create a list with indexes of each class
+ class_idx <- list()
+ for (i in 1:max(class))
+ class_idx[[i]] <- vector()
+ for (i in 1:num_instances)
+ class_idx[[class[i]]] <- c(class_idx[[class[i]]],i)
+ #print(class_idx)
+ # shuffle each index list
+ for (i in 1:max(class))
+ class_idx[[i]] <- sample(class_idx[[i]])
+ # print(class_idx)
+
+ selected <- vector()
+ sum_total <- 0
+
+ classes = sample(1:max(class))
+ # iterate of classes in random order
+ for (i in classes)
+ {
+ # count the numbers of selections that should be made
+ sum_total <- sum_total + length(class_idx[[i]])
+ selected_total <- round_it(sum_total*ratio)
+ # print(selected_total)
+ # count the number of new selections
+ selected_new <- selected_total - length(selected)
+ # print(selected_new)
+ # add selections
+ selected <- c(selected, class_idx[[i]][1:selected_new])
+ # print(selected)
+ # print("")
+ }
+
+ # convert selected indexs to split array
+ split <- array(0,num_instances)
+ for(i in selected)
+ split[i] <- 1
+ as.vector(split)
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" )
{
data.processed = as.matrix(process_data( data, colnames ))
print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess))
+ cl = NULL
if (preprocess == "none")
{
@@ -137,11 +186,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
data.strat = cbind(pik,data.processed)
- samplecube(data.strat,pik,order=2,comment=F)
+ split = samplecube(data.strat,pik,order=2,comment=F)
}
else if (method == "cluster_knn")
{
- cl = cluster_knn(data.processed)
+ cl = as.vector(cluster_knn(data.processed))
# require("caret")
# res = createDataPartition(cl,p=ratio)
# split = rep(1, times=nrow(data))
@@ -149,12 +198,26 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
# if ( is.na(match(j,res$Resample1)) )
# split[j]=0
# split
- stratified_split(cl,ratio,"samplecube")
+
+ split = stratified_split(cl,ratio,"samplecube")$split
}
else if (method == "cluster_hierarchical")
{
- cl = cluster_hierarchical(data.processed)
- stratified_split(cl,ratio,"samplecube")
+ cl = as.vector(cluster_hierarchical(data.processed))
+
+ #require("caret")
+ #res = createDataPartition(cl,p=ratio)
+ #split = rep(1, times=nrow(data))
+ #for (j in 1:nrow(data))
+ #if ( is.na(match(j,res$Resample1)) )
+ # split[j]=0
+
+ if (method_2 == "samplecube")
+ split = stratified_split(cl,ratio,"samplecube")$split
+ else if (method_2 == "explicit")
+ split = explicit_stratified_split(cl,ratio)
+ else
+ stop("unknown method")
}
## else if (method == "cluster2")
## {
@@ -199,6 +262,8 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU
## }
else
stop("unknown method")
+
+ list(split=split, cluster=cl)
}
anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
@@ -213,7 +278,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
data.processed = as.matrix(process_data( data, colnames ))
print(paste("anti-split using #features: ",ncol(data.processed)))
num_c = floor(1/ratio)
- cl = cluster(data.processed, num_c, num_c)
+ cl = cluster_knn(data.processed, num_c, num_c)
#print(cl)
idx = -1
min = 1000000
@@ -244,7 +309,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
for(j in 1:nrow(data))
split[j] = 1-split[j]
#print(split)
- as.vector(split)
+ list(split=as.vector(split),cluster=cl)
}
stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
@@ -382,7 +447,7 @@ plot_pre_process <- function( data, method="pca" )
stop("unknown method")
}
-plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
+plot_split <- function( data, color_idx, circle_idx=NULL, ... )
{
if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
stop("data not suitable for plotting, plot_pre_process() first")
@@ -390,40 +455,155 @@ plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... )
plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
if (is.null(names))
names <- c("split 1","split 2")
- colos = as.double(rep(2:(max(split)+2)))
- legend("topleft",names,pch=2,col=colos)
+ #colos = as.double(rep(2:(max(color_idx)+2)))
+ #legend("topleft",names,pch=2,col=colos)
+
+ #legend("topleft",names[1],pch=as.double(c(1)),col="red")
- for (j in max(color_idx):0)
+ ## for (j in max(color_idx):0)
+ ## {
+ ## for (k in max(shape_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && shape_idx[i]==k)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 15+k, col=(j+2))
+ ## }
+ ## }
+ col_offset = 2
+ if (!is.null(circle_idx))
+ col_offset = 1
+ for (j in 0:max(color_idx))
{
- for (k in max(shape_idx):0)
- {
- set = c()
- for (i in 1:nrow(data))
- if (color_idx[i]==j && shape_idx[i]==k)
- set = c(set,i)
- points(data[set,], pch = 15+k, col=(j+2))
- }
+ set = c()
+ for (i in 1:nrow(data))
+ if (color_idx[i]==j)
+ set = c(set,i)
+ points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+ }
+ if (!is.null(circle_idx))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (circle_idx[i]==1)
+ set = c(set,i)#
+ points(data[set,], pch = 1, cex=1, col=1)
+ points(data[set,], pch = 1, cex=1.8, col=1)
+
+ ## for (j in max(color_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && circle_idx[i]==1)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 19, cex=1, col=1)
+ ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset)
+ ## points(data[set,], pch = 1, cex=2, col=1)
+ ## }
+
}
}
+superboxplot <- function( data, ..., significance_level=0.95 )
+{
+ b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1)))
+
+ #print mean and stdev
+ for (i in 1:ncol(data))
+ {
+ med <- sprintf("%.3f",b$stats[,i][3])
+ stdev <- sprintf("%.3f",sqrt(var(data[,i])))
+ mtext(paste(med,"+-",stdev),side=1,at=i,line=2)
+ }
+
+ #print significance tests
+ sig <- array(list(),c(ncol(data),ncol(data)))
+ for (i in 1:(ncol(data)-1))
+ {
+ for (j in (i+1):ncol(data))
+ {
+ ttest = t.test(data[,i],data[,j],paired=T)
+ #print(ttest$p.value)
+ #print(is.na(ttest$p.value))
+ if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
+ {
+ sig[i,j] = T
+ sig[j,i] = T
+ }
+ else
+ {
+ sig[i,j] = F
+ sig[j,i] = F
+ }
+ }
+ }
+
+ for (i in 1:ncol(data))
+ {
+ ## s <- ""
+ ## for (j in 1:ncol(data))
+ ## {
+ ## if (i == j)
+ ## s <- paste( s, "-" ,sep="")
+ ## else if (sig[i,j]==T)
+ ## s <- paste( s, "X" ,sep="")
+ ## else
+ ## s <- paste( s, "0" ,sep="")
+ ## }
+
+ s<-""
+ bigger <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3])
+ bigger <- paste(bigger,j,sep=",")
+ }
+ if (nchar(bigger)>0)
+ {
+ bigger <- substring(bigger, 2)
+ bigger <- paste(">(",bigger,")",sep="")
+ s <- paste(s,bigger,sep=" ")
+ }
+ smaller <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3])
+ smaller <- paste(smaller,j,sep=",")
+ }
+ if (nchar(smaller)>0)
+ {
+ smaller <- substring(smaller, 2)
+ smaller <- paste("<(",smaller,")",sep="")
+ s <- paste(s,smaller,sep=" ")
+ }
+ mtext(s,side=1,at=i,line=3)
+ }
+
+ #print(sig)
+}
+
#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
#data<-rbind(a,b)
#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
#data<-rbind(data,c)
-#data=iris
+#data=iris[1:4]
#pca_reduce_features(data)
#data=ethanol
#split = stratified_k_fold_split(data, num_folds=3)
-#split = anti_stratified_split(data, ratio=0.75)
+#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
-#split = stratified_split(data, ratio=0.9,preprocess = "pca")
+#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
#cluster = cluster(ethanol,3,3)
#print(split)
-#print(sum(split))
-#plot_split(plot_pre_process(data, method="pca"),split,c("training","test"))
+#print(sum(split)
+#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split)
+#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster)