summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2012-10-19 23:47:11 +0200
committer mguetlein <martin.guetlein@gmail.com> 2012-10-19 23:47:11 +0200
commit 92228afaca2a1fd441a528999f91b515617c668a (patch)
tree 76e775d198c4abc19133036d553e44b2f0f25cc5
parent 4c389e95880e363eb03e165264476746690dc054 (diff)
fix contra-splitting by writing own sample method
-rw-r--r-- lib/r-util.rb 25
-rw-r--r-- lib/stratification.R 111
-rw-r--r-- lib/utils.rb 3
3 files changed, 114 insertions, 25 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb
index db76433..f10c59b 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -202,12 +202,24 @@ module OpenTox
end
+ private
+ def get_r_cols(pair_colors=false)
+ cols = ["red","cyan","green","magenta","blue","orange","seagreen","salmon","goldenrod","gray","orchid","khaki"]
+ if pair_colors
+ pair_cols=[]
+ cols.each{|c| pair_cols<<c; pair_cols<<"dark#{c}"}
+ cols = pair_cols
+ end
+ "col=c('#{cols.join("','")}')"
+ end
+
+ public
# example:
# files = ["/tmp/box.svg","/tmp/box.png"]
# data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
# boxplot(files, data, "comparison1" )
#
- def boxplot(files, data, title="", hline=nil, param="")
+ def boxplot(files, data, title="", hline=nil, param="", pair_colors=false)
LOGGER.debug("r-util> create boxplot "+data.inspect)
raise "no hashes, to keep order" if data.is_a?(Hash)
raise "boxplot: data is empty" if data.size==0
@@ -228,7 +240,7 @@ module OpenTox
max_median_idx = i if max_median==med
min_median = [min_median,med].min
min_median_idx = i if min_median==med
- data[i] = [data[i][0]+"(#{values.size})",data[i][1]] if @@boxplot_alg_info
+ data[i] = [data[i][0].to_s+"(#{values.size})",data[i][1]] if @@boxplot_alg_info
end
if min != max
times = max/min.to_f
@@ -256,7 +268,8 @@ module OpenTox
hlines << [max_median,2+max_median_idx]
hlines << [min_median,2+min_median_idx]
plot_to_files(files, hlines) do |file|
- @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})"
+ #@r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})"
+ @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',#{get_r_cols(pair_colors)}#{param_str})"
end
end
@@ -443,16 +456,16 @@ module OpenTox
end
return train, test
else
- raise unless stratification=~/^(super|super4|super5|super_bin|contra_eucl|contra_bin)$/
+ raise unless stratification=~/^(super|super4|super5|super_bin|contra_eucl2|contra_bin2)$/
anti = ""
super_method = ""
super_method_2 = ""
#preprocess = ""
case stratification
- when "contra_eucl"
+ when "contra_eucl2"
feature_type = "numerical"
anti = "contra_"
- when "contra_bin"
+ when "contra_bin2"
feature_type = "binary"
anti = "contra_"
when "super"
diff --git a/lib/stratification.R b/lib/stratification.R
index 8935278..6dbb980 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -4,9 +4,9 @@
#data=df_12117
# 250 compounds, 2 features
-#load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R")
-#data=df_11306
-#data=cbind(data[1],data[3])
+## load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R")
+## data=df_11306
+## data=cbind(data[1],data[3])
# 1000 compounds, 226 features
#load("/home/martin/tmp/image_12171.R")
@@ -285,6 +285,7 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL
}
samplesize = nrow(data.processed)/20
+
print(paste("samplesize",samplesize))
if (nrow(data.processed)<=samplesize)
sample = as.vector(1:nrow(data.processed))
@@ -350,12 +351,14 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL
#raw = array(0,nrow(data.processed))
for(i in 1:nrow(data.processed))
{
- prob[i] = (1-dist[i]/max) ^ 100
+ prob[i] = (1-dist[i]/max) ^ 10
#raw[i] = 1-dist[i]/max
if(prob[i]!=0)
not_nil = not_nil+1
}
-
+ #hist( log(prob[1:anti_strat_center-1] ))
+ #hist( c( prob[1:anti_strat_center-1],prob[anti_strat_center+1:length(prob)] ))
+ #hist( log( c( prob[1:anti_strat_center-1],prob[anti_strat_center+1:length(prob)] )))
print("head(prop) (convert distance into propability)")
print(head(prob))
@@ -372,19 +375,71 @@ contra_stratified_split <- function( data, feature_type, ratio=0.3, colnames=NUL
prob[i]=.Machine$double.xmin
}
- selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob)
+ #selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob)
+ selected = sample_self(1:nrow(data.processed),num_sel,prob)
split <- array(0,nrow(data.processed))
for(i in selected)
{
- #print(paste("sel ",i," raw ",raw[i]," dist ",prop[i]))
+ # print(paste("sel ",i," raw ",raw[i]," dist ",prob[i]))
split[i] <- 1
}
+ #for(i in 1:nrow(data.processed))
+# print(paste(i," sel",split[i]," raw ",raw[i]," dist ",prob[i]))
split = as.vector(split)
cl <- array(0,nrow(data.processed))
cl[anti_strat_center] = 1
- list(split=split,cluster=cl)
+ list(split=split,cluster=cl,dist=prob)
+}
+
+sample_self <- function(x,n,prop)
+{
+  # Weighted sampling without replacement.
+  #   x:    vector of candidates
+  #   n:    number of elements to draw
+  #   prop: weight of each element of x (must have the same length as x)
+  # Returns a vector holding the n drawn elements of x, in drawing order.
+  #
+  # Each draw picks one of the remaining elements with a chance proportional
+  # to its weight; the picked element and its weight are then removed before
+  # the next draw.
+  if (length(x)!=length(prop))
+    stop("wtf")
+  # Asking for more elements than exist used to slice x[2:1] on a one-element
+  # vector, which injects NA entries and then dies with an obscure error.
+  # Fail with a clear message instead.
+  if (n>length(x))
+    stop("sample_self: cannot draw more elements than available")
+  # empty vector of the same type as x
+  picked = x[0]
+  # Iterate instead of recursing once per drawn element, so a large n cannot
+  # exhaust R's nesting limit. seq_len(0) is empty, hence n=0 draws nothing
+  # (the recursive version always returned at least one element).
+  for(k in seq_len(n))
+  {
+    cum = cumsum(prop)
+    # uniform point within the total weight; the first element whose
+    # cumulative weight reaches that point is the one selected
+    r = runif(1,0.0,cum[length(cum)])
+    sel = which(cum>=r)[1]
+    picked = c(picked,x[sel])
+    # a negative index drops exactly position sel; this replaces the
+    # 1:sel-1 slices that only worked because index 0 is silently ignored
+    x = x[-sel]
+    prop = prop[-sel]
+  }
+  picked
}
stratified_split <- function( data, feature_type, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL ) #, preprocess="none"
@@ -674,7 +729,7 @@ plot_pre_process <- function( data, feature_type, method="pca" )
}
-plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... )
+plot_split <- function( data, color_idx=NULL, circle_idx=NULL, transparent=NULL, ... )
{
if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
stop("data not suitable for plotting, plot_pre_process() first")
@@ -708,11 +763,26 @@ plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... )
}
for (j in 0:max(color_idx))
{
- set = c()
- for (i in 1:nrow(data))
- if (color_idx[i]==j)
- set = c(set,i)
- points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+ if(is.null(transparent))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (color_idx[i]==j)
+ set = c(set,i)
+ points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+ }
+ else
+ {
+ for (i in 1:nrow(data))
+ if (color_idx[i]==j)
+ {
+ col_rgb = col2rgb((max(color_idx)-j)+col_offset)
+ col_rgb2=rgb(col_rgb[1]/255.0,col_rgb[2]/255.0,col_rgb[3]/255.0,transparent[i])
+ points(data[i,], pch = 19, cex=1, col=col_rgb2)
+ }
+
+ #points(data[set,], pch = 19, cex=1, col=col_rgb2)
+ }
}
if (!is.null(circle_idx))
{
@@ -919,13 +989,20 @@ pre_process_ttest_closer_to_zero <- function( x, y )
}
}
-split_plot <- function(ratio=0.33)
+split_plot <- function(feature_type,ratio=0.33)
{
+ seed = as.numeric(Sys.time())
+ print(paste("seed",seed))
+
+ #seed = 1350675271.02398
+
+ set.seed(seed)
print("splitting")
- split = contra_stratified_split(data, ratio=ratio)
+ split = contra_stratified_split(data, feature_type, ratio=ratio)
#print(split$cluster)
+
print("plotting")
- plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split)
+ plot_split(data,circle_idx=split$cluster,color_idx=split$split)#,transparent=split$dist)
}
norm_test <- function(data.orig)
diff --git a/lib/utils.rb b/lib/utils.rb
index 992e53a..7a777e3 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -10,9 +10,8 @@ module ProcessUtil
stdout_str = stdout.readlines.join("")
stderr_str = stderr.readlines.join("")
ignored, status = Process::waitpid2 pid
- exit_status = status.exitstatus
[stdin, stdout, stderr].each{|io| io.close}
- return stdout_str, stderr_str, exit_status
+ return stdout_str, stderr_str, status.exitstatus
end
def self.run(cmd)