diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-10-19 23:47:11 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-10-19 23:47:11 +0200 |
commit | 92228afaca2a1fd441a528999f91b515617c668a (patch) | |
tree | 76e775d198c4abc19133036d553e44b2f0f25cc5 | |
parent | 4c389e95880e363eb03e165264476746690dc054 (diff) |
fix contra-splitting by writing own sample method
-rw-r--r-- | lib/r-util.rb | 25 | ||||
-rw-r--r-- | lib/stratification.R | 111 | ||||
-rw-r--r-- | lib/utils.rb | 3 |
3 files changed, 114 insertions, 25 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb index db76433..f10c59b 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -202,12 +202,24 @@ module OpenTox end + private + def get_r_cols(pair_colors=false) + cols = ["red","cyan","green","magenta","blue","orange","seagreen","salmon","goldenrod","gray","orchid","khaki"] + if pair_colors + pair_cols=[] + cols.each{|c| pair_cols<<c; pair_cols<<"dark#{c}"} + cols = pair_cols + end + "col=c('#{cols.join("','")}')" + end + + public # example: # files = ["/tmp/box.svg","/tmp/box.png"] # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ] # boxplot(files, data, "comparison1" ) # - def boxplot(files, data, title="", hline=nil, param="") + def boxplot(files, data, title="", hline=nil, param="", pair_colors=false) LOGGER.debug("r-util> create boxplot "+data.inspect) raise "no hashes, to keep order" if data.is_a?(Hash) raise "boxplot: data is empty" if data.size==0 @@ -228,7 +240,7 @@ module OpenTox max_median_idx = i if max_median==med min_median = [min_median,med].min min_median_idx = i if min_median==med - data[i] = [data[i][0]+"(#{values.size})",data[i][1]] if @@boxplot_alg_info + data[i] = [data[i][0].to_s+"(#{values.size})",data[i][1]] if @@boxplot_alg_info end if min != max times = max/min.to_f @@ -256,7 +268,8 @@ module OpenTox hlines << [max_median,2+max_median_idx] hlines << [min_median,2+min_median_idx] plot_to_files(files, hlines) do |file| - @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})" + #@r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})" + @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',#{get_r_cols(pair_colors)}#{param_str})" end end @@ -443,16 +456,16 @@ module OpenTox end return train, test else - raise unless stratification=~/^(super|super4|super5|super_bin|contra_eucl|contra_bin)$/ + raise unless stratification=~/^(super|super4|super5|super_bin|contra_eucl2|contra_bin2)$/ anti = "" super_method = "" super_method_2 = "" #preprocess = "" case stratification - when "contra_eucl" + when "contra_eucl2" feature_type = "numerical" anti = "contra_" - when "contra_bin" + when "contra_bin2" feature_type = "binary" anti = "contra_" when "super" diff --git a/lib/stratification.R b/lib/stratification.R index 8935278..6dbb980 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -4,9 +4,9 @@ #data=df_12117 # 250 compounds, 2 features -#load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R") -#data=df_11306 -#data=cbind(data[1],data[3]) +## load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R") +## data=df_11306 +## data=cbind(data[1],data[3]) # 1000 compounds, 226 features #load("/home/martin/tmp/image_12171.R") @@ -285,6 +285,7 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL } samplesize = nrow(data.processed)/20 + print(paste("samplesize",samplesize)) if (nrow(data.processed)<=samplesize) sample = as.vector(1:nrow(data.processed)) @@ -350,12 +351,14 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL #raw = array(0,nrow(data.processed)) for(i in 1:nrow(data.processed)) { - prob[i] = (1-dist[i]/max) ^ 100 + prob[i] = (1-dist[i]/max) ^ 10 #raw[i] = 1-dist[i]/max if(prob[i]!=0) not_nil = not_nil+1 } - + #hist( log(prob[1:anti_strat_center-1] )) + #hist( c( prob[1:anti_strat_center-1],prob[anti_strat_center+1:length(prob)] )) + #hist( log( c( prob[1:anti_strat_center-1],prob[anti_strat_center+1:length(prob)] ))) print("head(prop) (convert distance into propability)") print(head(prob)) @@ -372,19 +375,71 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL prob[i]=.Machine$double.xmin } - selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob) + #selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob) + selected = sample_self(1:nrow(data.processed),num_sel,prob) split <- array(0,nrow(data.processed)) for(i in selected) { - #print(paste("sel ",i," raw ",raw[i]," dist ",prop[i])) + # print(paste("sel ",i," raw ",raw[i]," dist ",prob[i])) split[i] <- 1 } + #for(i in 1:nrow(data.processed)) +# print(paste(i," sel",split[i]," raw ",raw[i]," dist ",prob[i])) split = as.vector(split) cl <- array(0,nrow(data.processed)) cl[anti_strat_center] = 1 - list(split=split,cluster=cl) + list(split=split,cluster=cl,dist=prob) +} + +sample_self <- function(x,n,prop) +{ + if (length(x)!=length(prop)) + stop("wtf") + sum_prop = 0 + for(i in 1:length(x)) + sum_prop = sum_prop + prop[i] + #print("start") + #print(x) + #print(n) + #print(prop) + #print(paste("sum ",sum_prop)) + r = runif(1,0.0,sum_prop) + #print(paste("rand",r)) + sum_prop = 0 + for(i in 1:length(x)) + { + sum_prop = sum_prop + prop[i] + #print(paste("current sum",sum_prop)) + if(r <= sum_prop) + { + sel = i + break + } + } + if (n>1) + { + if(sel==1) + { + x_ = x[2:length(x)] + prop_ = prop[2:length(prop)] + } + else if(sel==length(x)) + { + x_ = x[1:length(x)-1] + prop_ = prop[1:length(prop)-1] + } + else + { + x_ = c(x[1:sel-1],x[(sel+1):length(x)]) + prop_ = c(prop[1:sel-1],prop[(sel+1):length(prop)]) + } + n_ = n - 1 + c(x[i],sample_self(x_,n_,prop_)) + } + else + x[i] } stratified_split <- function( data, feature_type, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL ) #, preprocess="none" @@ -674,7 +729,7 @@ plot_pre_process <- function( data, feature_type, method="pca" ) } -plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... ) +plot_split <- function( data, color_idx=NULL, circle_idx=NULL, transparent=NULL, ... ) { if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2])) stop("data not suitable for plotting, plot_pre_process() first") @@ -708,11 +763,26 @@ plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... ) } for (j in 0:max(color_idx)) { - set = c() - for (i in 1:nrow(data)) - if (color_idx[i]==j) - set = c(set,i) - points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) + if(is.null(transparent)) + { + set = c() + for (i in 1:nrow(data)) + if (color_idx[i]==j) + set = c(set,i) + points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) + } + else + { + for (i in 1:nrow(data)) + if (color_idx[i]==j) + { + col_rgb = col2rgb((max(color_idx)-j)+col_offset) + col_rgb2=rgb(col_rgb[1]/255.0,col_rgb[2]/255.0,col_rgb[3]/255.0,transparent[i]) + points(data[i,], pch = 19, cex=1, col=col_rgb2) + } + + #points(data[set,], pch = 19, cex=1, col=col_rgb2) + } } if (!is.null(circle_idx)) { @@ -919,13 +989,20 @@ pre_process_ttest_closer_to_zero <- function( x, y ) } } -split_plot <- function(ratio=0.33) +split_plot <- function(feature_type,ratio=0.33) { + seed = as.numeric(Sys.time()) + print(paste("seed",seed)) + + #seed = 1350675271.02398 + + set.seed(seed) print("splitting") - split = contra_stratified_split(data, ratio=ratio) + split = contra_stratified_split(data, feature_type, ratio=ratio) #print(split$cluster) + print("plotting") - plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split) + plot_split(data,circle_idx=split$cluster,color_idx=split$split)#,transparent=split$dist) } norm_test <- function(data.orig) diff --git a/lib/utils.rb b/lib/utils.rb index 992e53a..7a777e3 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -10,9 +10,8 @@ module ProcessUtil stdout_str = stdout.readlines.join("") stderr_str = stderr.readlines.join("") ignored, status = Process::waitpid2 pid - exit_status = status.exitstatus [stdin, stdout, stderr].each{|io| io.close} - return stdout_str, stderr_str, exit_status + return stdout_str, stderr_str, status.exitstatus end def self.run(cmd) |