diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-07-23 22:28:08 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-07-23 22:28:08 +0200 |
commit | c4342873cb6c3aeaf4496ac12d05f47347a4f945 (patch) | |
tree | daf612ee0722470336ec24ed60d1e01f6a393b4d | |
parent | d8f8050ba8bf47c5a272bcb937b4ee4f18ac5da0 (diff) |
extend/fix(?) stratified splitting, more r-util stuff
-rw-r--r-- | lib/r-util.rb | 101 | ||||
-rw-r--r-- | lib/stratification.R | 230 |
2 files changed, 291 insertions, 40 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb index bbf98f3..952b873 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -58,10 +58,40 @@ module OpenTox end end + + def ttest_matrix(arrays, paired, significance_level=0.95) + + LOGGER.info("perform ttest matrix") + result = [] + arrays.size.times do |i| + result[i] = [] + @r.assign "v#{i}",arrays[i] + end + (arrays.size-1).times do |i| + (i+1..arrays.size-1).each do |j| + raise if paired && arrays[i].size!=arrays[j].size + @r.eval "ttest = t.test(as.numeric(v#{i}),as.numeric(v#{j}),paired=#{paired ? "T" : "F"})" + t = @r.pull "ttest$statistic" + p = @r.pull "ttest$p.value" + if (1-significance_level > p) + result[i][j]=true + result[j][i]=true + else + result[i][j]=false + result[j][i]=false + end + end + end + result + end + + + # <0 -> array1 << array2 # 0 -> no significant difference # >0 -> array2 >> array1 def ttest(array1, array2, paired, significance_level=0.95) + LOGGER.info("perform ttest") @r.assign "v1",array1 @r.assign "v2",array2 raise if paired && array1.size!=array2.size @@ -106,6 +136,7 @@ module OpenTox def boxplot(files, data, title="", hline=nil, param="") LOGGER.debug("r-util> create boxplot "+data.inspect) raise "no hashes, to keep order" if data.is_a?(Hash) + raise "boxplot: data is empty" if data.size==0 max = -1 min = 100000 max_median = -1 @@ -158,7 +189,7 @@ module OpenTox # embedds feature values of two datasets into 2D and plots it # def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, - features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil) + features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil ) LOGGER.debug("r-util> create feature value plot") d1 = OpenTox::Dataset.find(dataset_uri1,subjectid) @@ -178,23 +209,42 @@ module OpenTox waiting_task.progress(50) if waiting_task @r.eval "df <- rbind(#{df1},#{df2})" - @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))" + @r.eval "split <- c(rep(1,nrow(#{df1})),rep(0,nrow(#{df2})))" @r.names = [dataset_name1, dataset_name2] LOGGER.debug("r-util> - convert data to 2d") #@r.eval "save.image(\"/tmp/image.R\")" + if (color_feature) + color = [] + [d1,d2].each do |d| + raise "no #{color_feature}, instead: #{d.features.keys.sort.inspect}" unless d.features.has_key?(color_feature) + d.compounds.each do |c| + color += d.data_entries[c][color_feature] + end + end + @r.assign "color",color + end + if (direct_plot) raise unless features.size==2 @r.eval "df.2d <- df" + x = features[0].split("/")[-1] + y = features[1].split("/")[-1] else @r.eval "df.2d <- plot_pre_process(df, method='sammon')" + x = "x" + y = "y" end waiting_task.progress(75) if waiting_task title = "Sammon embedding of #{features.size} features" unless title LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, main='#{title}',xlab='x',ylab='y')" + if (color_feature) + @r.eval "plot_split( df.2d, color_idx=color, circle_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')" + else + @r.eval "plot_split( df.2d, color_idx=split, main='#{title}',xlab='#{x}',ylab='#{y}')" + end end end @@ -255,8 +305,10 @@ module OpenTox # stratified splits a dataset into two dataset according to the feature values # all features are taken into account unless <split_features> is given # returns two datases - def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false ) - stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification ) + def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, + seed=42, split_features=nil, anti_stratification=false, store_split_clusters=false ) + stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, + seed, split_features, anti_stratification, store_split_clusters ) end # stratified splits a dataset into k datasets according the feature values @@ -267,7 +319,8 @@ module OpenTox end private - def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super" ) + def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, + pct=nil, subjectid=nil, seed=42, split_features=nil, stratification="super", store_split_clusters=false ) raise "internal error" if num_folds!=nil and pct!=nil k_fold_split = num_folds!=nil if k_fold_split @@ -303,9 +356,10 @@ module OpenTox end return train, test else - raise unless stratification=~/^(super|super4|anti)$/ + raise unless stratification=~/^(super|super4|super5|anti)$/ anti = "" super_method = "" + super_method_2 = "" preprocess = "" case stratification when "anti" @@ -315,14 +369,19 @@ module OpenTox when "super4" super_method = ", method='cluster_hierarchical'" preprocess = ", preprocess='pca'" + when "super5" + super_method = ", method='cluster_hierarchical'" + super_method_2 = ", method_2='explicit'" + preprocess = ", preprocess='pca'" end - puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})" - @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{preprocess})" - split = @r.pull 'split' + LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})" + @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})" + split = @r.pull 'split$split' + cluster = (store_split_clusters ? @r.pull('split$cluster') : nil) metadata[DC.title] = "Training dataset split of "+dataset.uri - train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 } + train = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==1 } metadata[DC.title] = "Test dataset split of "+dataset.uri - test = split_to_dataset( df, split, metadata, subjectid, missing_values ){ |i| i==0 } + test = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==0 } return train, test end end @@ -436,7 +495,7 @@ module OpenTox NEW = false private - def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA" ) + def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA", cluster=nil ) raise unless @@feats[df].size>0 missing_value_regexp = Regexp.new("^#{missing_values.to_s=="0" ? "(0.0|0)" : missing_values.to_s}$") unless NEW @@ -469,14 +528,26 @@ module OpenTox end end end + if cluster + cluster_feature = "http://no.such.domain/feature/split_cluster" + dataset.add_feature(cluster_feature) + #LOGGER.warn "adding feature #{cluster_feature}" + compounds.size.times do |r| + if compound_indices==nil or compound_indices.include?(r) + dataset.add(compounds[r],cluster_feature,cluster[r],true) + end + end + else + #LOGGER.warn "no cluster feature" + end dataset.save(subjectid) dataset end - def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA" ) + def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA", cluster=nil ) indices = [] split.size.times{|i| indices<<i if yield(split[i]) } - dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values ) + dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values, cluster ) LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}") dataset end diff --git a/lib/stratification.R b/lib/stratification.R index 8a838fa..f12b302 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -113,10 +113,59 @@ cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15 cut } -stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NULL, preprocess="none" ) +# input: classes [ 1 2 4 3 1 2 4 3 2 1 2 3 3 2 ] +# input : ratio 0.1 +# output: split [ 1 0 0 1 0 0 0 0 0 1 0 0 0 0 ] +explicit_stratified_split <- function( class, ratio=0.3 ) +{ +# print(class) + num_instances = length(class) +# print(num_instances) + + # create a list with indexes of each class + class_idx <- list() + for (i in 1:max(class)) + class_idx[[i]] <- vector() + for (i in 1:num_instances) + class_idx[[class[i]]] <- c(class_idx[[class[i]]],i) + #print(class_idx) + # shuffle each index list + for (i in 1:max(class)) + class_idx[[i]] <- sample(class_idx[[i]]) + # print(class_idx) + + selected <- vector() + sum_total <- 0 + + classes = sample(1:max(class)) + # iterate of classes in random order + for (i in classes) + { + # count the numbers of selections that should be made + sum_total <- sum_total + length(class_idx[[i]]) + selected_total <- round_it(sum_total*ratio) + # print(selected_total) + # count the number of new selections + selected_new <- selected_total - length(selected) + # print(selected_new) + # add selections + selected <- c(selected, class_idx[[i]][1:selected_new]) + # print(selected) + # print("") + } + + # convert selected indexs to split array + split <- array(0,num_instances) + for(i in selected) + split[i] <- 1 + as.vector(split) +} + +stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" ) { data.processed = as.matrix(process_data( data, colnames )) print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess)) + cl = NULL if (preprocess == "none") { @@ -137,11 +186,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed) pik = rep(ratio,times=nrow(data.processed)) data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) + split = samplecube(data.strat,pik,order=2,comment=F) } else if (method == "cluster_knn") { - cl = cluster_knn(data.processed) + cl = as.vector(cluster_knn(data.processed)) # require("caret") # res = createDataPartition(cl,p=ratio) # split = rep(1, times=nrow(data)) @@ -149,12 +198,26 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU # if ( is.na(match(j,res$Resample1)) ) # split[j]=0 # split - stratified_split(cl,ratio,"samplecube") + + split = stratified_split(cl,ratio,"samplecube")$split } else if (method == "cluster_hierarchical") { - cl = cluster_hierarchical(data.processed) - stratified_split(cl,ratio,"samplecube") + cl = as.vector(cluster_hierarchical(data.processed)) + + #require("caret") + #res = createDataPartition(cl,p=ratio) + #split = rep(1, times=nrow(data)) + #for (j in 1:nrow(data)) + #if ( is.na(match(j,res$Resample1)) ) + # split[j]=0 + + if (method_2 == "samplecube") + split = stratified_split(cl,ratio,"samplecube")$split + else if (method_2 == "explicit") + split = explicit_stratified_split(cl,ratio) + else + stop("unknown method") } ## else if (method == "cluster2") ## { @@ -199,6 +262,8 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", colnames=NU ## } else stop("unknown method") + + list(split=split, cluster=cl) } anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) @@ -213,7 +278,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) data.processed = as.matrix(process_data( data, colnames )) print(paste("anti-split using #features: ",ncol(data.processed))) num_c = floor(1/ratio) - cl = cluster(data.processed, num_c, num_c) + cl = cluster_knn(data.processed, num_c, num_c) #print(cl) idx = -1 min = 1000000 @@ -244,7 +309,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) for(j in 1:nrow(data)) split[j] = 1-split[j] #print(split) - as.vector(split) + list(split=as.vector(split),cluster=cl) } stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL ) @@ -382,7 +447,7 @@ plot_pre_process <- function( data, method="pca" ) stop("unknown method") } -plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... ) +plot_split <- function( data, color_idx, circle_idx=NULL, ... ) { if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2])) stop("data not suitable for plotting, plot_pre_process() first") @@ -390,40 +455,155 @@ plot_split <- function( data, color_idx, names=NULL, shape_idx=color_idx, ... ) plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... ) if (is.null(names)) names <- c("split 1","split 2") - colos = as.double(rep(2:(max(split)+2))) - legend("topleft",names,pch=2,col=colos) + #colos = as.double(rep(2:(max(color_idx)+2))) + #legend("topleft",names,pch=2,col=colos) + + #legend("topleft",names[1],pch=as.double(c(1)),col="red") - for (j in max(color_idx):0) + ## for (j in max(color_idx):0) + ## { + ## for (k in max(shape_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && shape_idx[i]==k) + ## set = c(set,i) + ## points(data[set,], pch = 15+k, col=(j+2)) + ## } + ## } + col_offset = 2 + if (!is.null(circle_idx)) + col_offset = 1 + for (j in 0:max(color_idx)) { - for (k in max(shape_idx):0) - { - set = c() - for (i in 1:nrow(data)) - if (color_idx[i]==j && shape_idx[i]==k) - set = c(set,i) - points(data[set,], pch = 15+k, col=(j+2)) - } + set = c() + for (i in 1:nrow(data)) + if (color_idx[i]==j) + set = c(set,i) + points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) + } + if (!is.null(circle_idx)) + { + set = c() + for (i in 1:nrow(data)) + if (circle_idx[i]==1) + set = c(set,i)# + points(data[set,], pch = 1, cex=1, col=1) + points(data[set,], pch = 1, cex=1.8, col=1) + + ## for (j in max(color_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && circle_idx[i]==1) + ## set = c(set,i) + ## points(data[set,], pch = 19, cex=1, col=1) + ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset) + ## points(data[set,], pch = 1, cex=2, col=1) + ## } + } } +superboxplot <- function( data, ..., significance_level=0.95 ) +{ + b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1))) + + #print mean and stdev + for (i in 1:ncol(data)) + { + med <- sprintf("%.3f",b$stats[,i][3]) + stdev <- sprintf("%.3f",sqrt(var(data[,i]))) + mtext(paste(med,"+-",stdev),side=1,at=i,line=2) + } + + #print significance tests + sig <- array(list(),c(ncol(data),ncol(data))) + for (i in 1:(ncol(data)-1)) + { + for (j in (i+1):ncol(data)) + { + ttest = t.test(data[,i],data[,j],paired=T) + #print(ttest$p.value) + #print(is.na(ttest$p.value)) + if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value) + { + sig[i,j] = T + sig[j,i] = T + } + else + { + sig[i,j] = F + sig[j,i] = F + } + } + } + + for (i in 1:ncol(data)) + { + ## s <- "" + ## for (j in 1:ncol(data)) + ## { + ## if (i == j) + ## s <- paste( s, "-" ,sep="") + ## else if (sig[i,j]==T) + ## s <- paste( s, "X" ,sep="") + ## else + ## s <- paste( s, "0" ,sep="") + ## } + + s<-"" + bigger <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) + bigger <- paste(bigger,j,sep=",") + } + if (nchar(bigger)>0) + { + bigger <- substring(bigger, 2) + bigger <- paste(">(",bigger,")",sep="") + s <- paste(s,bigger,sep=" ") + } + smaller <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) + smaller <- paste(smaller,j,sep=",") + } + if (nchar(smaller)>0) + { + smaller <- substring(smaller, 2) + smaller <- paste("<(",smaller,")",sep="") + s <- paste(s,smaller,sep=" ") + } + mtext(s,side=1,at=i,line=3) + } + + #print(sig) +} + #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) #data<-rbind(a,b) #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) #data<-rbind(data,c) -#data=iris +#data=iris[1:4] #pca_reduce_features(data) #data=ethanol #split = stratified_k_fold_split(data, num_folds=3) -#split = anti_stratified_split(data, ratio=0.75) +#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical") -#split = stratified_split(data, ratio=0.9,preprocess = "pca") +#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit") #cluster = cluster(ethanol,3,3) #print(split) -#print(sum(split)) -#plot_split(plot_pre_process(data, method="pca"),split,c("training","test")) +#print(sum(split) +#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split) +#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster) |