summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormguetlein <martin.guetlein@gmail.com>2012-10-12 10:21:42 +0200
committermguetlein <martin.guetlein@gmail.com>2012-10-12 10:21:42 +0200
commit8aa3468ce02816f515f304ac7144b74a6eb2f986 (patch)
tree2a5e4bd382d870b2197df0211bb7ad71eee2b71c
parent56d3b604b089af91e3e82c8e9ff3e77841d845fb (diff)
add contra stratified splitting, add jaccard distance for binary fragments, add test stuff, additional fixes
-rw-r--r--lib/r-util.rb222
-rw-r--r--lib/stratification.R667
2 files changed, 654 insertions, 235 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 952b873..fddcd43 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -21,6 +21,14 @@ class Array
end
+class RinRuby
+ def puts(object)
+ object.to_s.split("\n").each do |s|
+ LOGGER.debug "R> #{s.chomp}" if s.chomp.length>0
+ end
+ end
+end
+
module OpenTox
class RUtil
@@ -29,6 +37,7 @@ module OpenTox
def initialize
@r = RinRuby.new(true,false) unless defined?(@r) and @r
+ #@r.eval "sink(type='message')"
@r.eval ".libPaths('#{PACKAGE_DIR}')"
@r_packages = @r.pull "installed.packages()[,1]"
["sampling","gam","vegan","dynamicTreeCut"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
@@ -58,21 +67,46 @@ module OpenTox
end
end
+# def ttest_matrix_deviation(arrays, significance_level=0.95)
+#
+# LOGGER.info("perform ttest matrix deviation")
+# result = []
+# arrays.size.times do |i|
+# result[i] = []
+# @r.assign "v#{i}",arrays[i]
+# @r.eval "v#{i}<-abs(as.numeric(v#{i}))"
+# end
+# (arrays.size-1).times do |i|
+# (i+1..arrays.size-1).each do |j|
+# raise if arrays[i].size!=arrays[j].size
+# @r.eval "ttest = t.test(v#{i}-v#{j},conf.level=#{significance_level})"
+# min = @r.pull "ttest$conf.int[1]"
+# max = @r.pull "ttest$conf.int[2]"
+# if (min>0 or max<0)
+# result[i][j]=true
+# result[j][i]=true
+# else
+# result[i][j]=false
+# result[j][i]=false
+# end
+# end
+# end
+# result
+# end
- def ttest_matrix(arrays, paired, significance_level=0.95)
-
- LOGGER.info("perform ttest matrix")
+ def pvalue_test_matrix(test, arrays, significance_level=0.95, params="")
+ LOGGER.info("perform test '#{test}' matrix")
result = []
arrays.size.times do |i|
result[i] = []
@r.assign "v#{i}",arrays[i]
+ @r.eval "v#{i}<-as.numeric(v#{i})"
end
(arrays.size-1).times do |i|
(i+1..arrays.size-1).each do |j|
- raise if paired && arrays[i].size!=arrays[j].size
- @r.eval "ttest = t.test(as.numeric(v#{i}),as.numeric(v#{j}),paired=#{paired ? "T" : "F"})"
- t = @r.pull "ttest$statistic"
- p = @r.pull "ttest$p.value"
+ @r.eval "test = #{test}(v#{i},v#{j},#{params})"
+ t = @r.pull "test$statistic"
+ p = @r.pull "test$p.value"
if (1-significance_level > p)
result[i][j]=true
result[j][i]=true
@@ -85,24 +119,61 @@ module OpenTox
result
end
+ def ttest_matrix(arrays, paired, significance_level=0.95)
+ (arrays.size-1).times do |i|
+ (i+1..arrays.size-1).each do |j|
+ raise if paired && arrays[i].size!=arrays[j].size
+ end
+ end
+ params = "paired=#{paired ? "T" : "F"}"
+ pvalue_test_matrix("t.test",arrays,significance_level,params)
+ end
+
+ def ftest_matrix(arrays, paired, significance_level=0.95)
+ pvalue_test_matrix("var.test",arrays,significance_level)
+ end
+
+ def ttest_closer_to_zero_matrix(arrays, paired, significance_level=0.95)
+ (arrays.size-1).times do |i|
+ (i+1..arrays.size-1).each do |j|
+ raise if paired && arrays[i].size!=arrays[j].size
+ end
+ end
+ params = "paired=#{paired ? "T" : "F"}"
+ pvalue_test_matrix("ttest_closer_to_zero",arrays,significance_level,params)
+ end
+ def pvalue_test(test, array1, array2, significance_level=0.95, params="")
+ LOGGER.info("perform test '#{test}'")
+ @r.assign "v1",array1
+ @r.assign "v2",array2
+ @r.eval "test = #{test}(as.numeric(v1),as.numeric(v2),#{params})"
+ t = @r.pull "test$statistic"
+ p = @r.pull "test$p.value"
+ if (1-significance_level > p)
+ t
+ else
+ 0
+ end
+ end
# <0 -> array1 << array2
# 0 -> no significant difference
# >0 -> array2 >> array1
def ttest(array1, array2, paired, significance_level=0.95)
- LOGGER.info("perform ttest")
- @r.assign "v1",array1
- @r.assign "v2",array2
raise if paired && array1.size!=array2.size
- @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=#{paired ? "T" : "F"})"
- t = @r.pull "ttest$statistic"
- p = @r.pull "ttest$p.value"
- if (1-significance_level > p)
- t
- else
- 0
- end
+ params = "paired=#{paired ? "T" : "F"}"
+ pvalue_test("t.test",array1,array2,significance_level,params)
+ end
+
+ def ftest(array1, array2, significance_level=0.95)
+ pvalue_test("var.test",array1,array2,significance_level)
+ end
+
+ def ttest_closer_to_zero
+ raise if paired && array1.size!=array2.size
+ params = "paired=#{paired ? "T" : "F"}"
+ pvalue_test("ttest_closer_to_zero",array1,array2,significance_level,params)
end
def pvalue(array1, array2)
@@ -119,10 +190,13 @@ module OpenTox
min = @r.pull "ttest$conf.int[1]"
max = @r.pull "ttest$conf.int[2]"
if value2 <= min
+ LOGGER.debug "perform ttest-single: significant=true, #{value2} is lower than conf-interval [ #{min} - #{max} ]"
1
elsif value2 >= max
+ LOGGER.debug "perform ttest-single: significant=true, #{value2} is higher than conf-interval [ #{min} - #{max} ]"
-1
else
+ LOGGER.debug "perform ttest-single: significant=false, #{value2} is inside conf-interval [ #{min} - #{max} ]"
0
end
end
@@ -154,7 +228,7 @@ module OpenTox
max_median_idx = i if max_median==med
min_median = [min_median,med].min
min_median_idx = i if min_median==med
- data[i] = [data[i][0]+"(#{values.size})",data[i][1]]
+ data[i] = [data[i][0]+"(#{values.size})",data[i][1]] if @@boxplot_alg_info
end
if min != max
times = max/min.to_f
@@ -182,25 +256,30 @@ module OpenTox
hlines << [max_median,2+max_median_idx]
hlines << [min_median,2+min_median_idx]
plot_to_files(files, hlines) do |file|
- @r.eval "superboxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1})#{param_str})"
+ @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})"
end
end
# embeds feature values of two datasets into 2D and plots it
#
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
- features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil )
+ prediction_feature=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil )
LOGGER.debug("r-util> create feature value plot")
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
- if features
- [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
- else
- raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
+
+ raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
(d1.features.keys.sort != d2.features.keys.sort)
- features = d1.features.keys
- end
+ features = d1.features.keys
+ if prediction_feature
+ if features.include?(prediction_feature)
+ features -= [prediction_feature]
+ else
+ LOGGER.debug "prediction feature #{prediction_feature} cannot be removed because not included in #{dataset_uri1}"
+ end
+ end
+
raise "at least two features needed" if d1.features.keys.size<2
waiting_task.progress(25) if waiting_task
@@ -258,6 +337,11 @@ module OpenTox
if (is_numerical)
@r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
{
+ if( log && ( min(data1)<=0 || min(data2)<=0 ))
+ {
+ print('disabling log because of datapoints <= 0')
+ log = FALSE
+ }
if (log)
{
data1 <- log(data1)
@@ -265,7 +349,8 @@ module OpenTox
xlab = paste('logarithm of',xlab,sep=' ')
}
xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
- h <- hist(rbind(data1,data2),plot=F)
+ save.image('/tmp/image.R')
+ h <- hist(c(data1,data2),plot=F)
h1 <- hist(data1,plot=F,breaks=h$breaks)
h2 <- hist(data2,plot=F,breaks=h$breaks)
xlims = c(min(h$breaks),max(h$breaks))
@@ -356,32 +441,44 @@ module OpenTox
end
return train, test
else
- raise unless stratification=~/^(super|super4|super5|anti)$/
+ raise unless stratification=~/^(super|super4|super5|contra)$/
anti = ""
super_method = ""
super_method_2 = ""
- preprocess = ""
+ #preprocess = ""
case stratification
- when "anti"
- anti = "anti_"
+ when "contra"
+ anti = "contra_"
when "super"
super_method = ", method='cluster_knn'"
when "super4"
super_method = ", method='cluster_hierarchical'"
- preprocess = ", preprocess='pca'"
+ #preprocess = ", preprocess='pca'"
when "super5"
super_method = ", method='cluster_hierarchical'"
super_method_2 = ", method_2='explicit'"
- preprocess = ", preprocess='pca'"
+ #preprocess = ", preprocess='pca'"
end
- LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
- @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})"
+ LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2})" # #{preprocess}
+ @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2})" # #{preprocess}
split = @r.pull 'split$split'
cluster = (store_split_clusters ? @r.pull('split$cluster') : nil)
metadata[DC.title] = "Training dataset split of "+dataset.uri
train = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==1 }
metadata[DC.title] = "Test dataset split of "+dataset.uri
test = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==0 }
+
+ f = "/tmp/split_pic.svg"
+ LOGGER.debug "plotting to #{f} .."
+ @r.eval "num_feats = #{split_features ? split_features.size : "ncol(plot_data)"}"
+ @r.eval "plot_data = process_data(#{df}, #{str_split_features})"
+ @r.eval "plot_data = plot_pre_process(plot_data, method='sammon')"
+ @r.eval "title = paste('sammon embedding for splitting #{df},',num_feats,'features,',nrow(plot_data),'instances')"
+ plot_to_files([f]) do |file|
+ @r.eval "plot_split(plot_data,color_idx=split$split, main=title)"
+ end
+ LOGGER.debug "plotting to #{f} .. done"
+
return train, test
end
end
@@ -507,26 +604,35 @@ module OpenTox
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
- features.size.times do |c|
- LOGGER.debug "r-util> dataframe to dataset - feature #{c+1} / #{features.size}" if
- c%25==0 && (features.size*compounds.size)>100000
- if features[c]=~/\/feature\/bbrc\//
- numeric=true
+ features.size.times do |f_i|
+ LOGGER.debug "r-util> dataframe to dataset - feature #{f_i+1} / #{features.size}" if
+ f_i%25==0 && (features.size*compounds.size)>100000
+ if features[f_i]=~/\/feature\/bbrc\//
+ numeric="int"
else
- type = @@feats[df][features[c]][RDF.type]
+ type = @@feats[df][features[f_i]][RDF.type]
unless type
LOGGER.debug "r-util> derive feature type by rest-call"
- feat = OpenTox::Feature.find(features[c],subjectid)
+ feat = OpenTox::Feature.find(features[f_i],subjectid)
type = feat.metadata[RDF.type]
end
- numeric = type.to_a.flatten.include?(OT.NumericFeature)
+ numeric = type.to_a.flatten.include?(OT.NumericFeature) ? "float" : nil
end
- compounds.size.times do |r|
- if compound_indices==nil or compound_indices.include?(r)
- dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c], true) if
- ((NEW and values[r][c]!=nil) or (values[r][c]!="NA" and !(values[r][c] =~ missing_value_regexp)))
+ case numeric
+ when "int"
+ def convert_numeric(v); v.to_i; end
+ when "float"
+ def convert_numeric(v); v.to_f; end
+ else
+ def convert_numeric(v); v; end
+ end
+ compounds.size.times do |c_i|
+ if compound_indices==nil or compound_indices.include?(c_i)
+ dataset.add(compounds[c_i],features[f_i],convert_numeric(values[c_i][f_i]), true) if
+ ((NEW and values[c_i][f_i]!=nil) or (values[c_i][f_i]!="NA" and !(values[c_i][f_i] =~ missing_value_regexp)))
end
end
+
end
if cluster
cluster_feature = "http://no.such.domain/feature/split_cluster"
@@ -595,20 +701,36 @@ module OpenTox
@@svg_plot_width = 12
@@svg_plot_height = 8
-
+
public
def set_svg_plot_size(width,height)
@@svg_plot_width = width
@@svg_plot_height = height
end
+ @@png_plot_width = 800
+ @@png_plot_height = 600
+ @@png_plot_pointsize = 12
+
+ def set_png_plot_size(width,height,pointsize)
+ @@png_plot_width = width
+ @@png_plot_height = height
+ @@png_plot_pointsize = pointsize
+ end
+
+ @@boxplot_alg_info = true
+
+ def set_boxplot_alg_info(boxplot_alg_info)
+ @@boxplot_alg_info = boxplot_alg_info
+ end
+
private
def plot_to_files(files,hlines=nil)
files.each do |file|
if file=~/(?i)\.svg/
@r.eval("svg('#{file}',#{@@svg_plot_width},#{@@svg_plot_height})")
elsif file=~/(?i)\.png/
- @r.eval("png('#{file}')")
+ @r.eval("png('#{file}',width=#{@@png_plot_width},height=#{@@png_plot_height},pointsize=#{@@png_plot_pointsize})")
else
raise "invalid format: "+file.to_s
end
diff --git a/lib/stratification.R b/lib/stratification.R
index 238515d..dcc630e 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,4 +1,39 @@
+# bbrc dataset 249 x 530
+#load("/home/martin/workspace/ValidationExperiments/bbrc_image.R")
+#data=df_12117
+
+# 250 compounds, 2 features
+#load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R")
+#data=df_11306
+#data=cbind(data[1],data[3])
+
+# 1000 compounds, 226 features
+#load("/home/martin/tmp/image_12171.R")
+#data=df_12171
+
+is_binary <- function( data )
+{
+ #print(length(data[data==0]))
+ #print(length(data[data==1]))
+ #print(length(data[data==0])+length(data[data==1]))
+ #print(nrow(data))
+ #print(ncol(data))
+ #print(nrow(data)*ncol(data))
+ #print(head(data))
+ length(data[data==0])+length(data[data==1]) == nrow(data)*ncol(data)
+}
+
+is_really_numeric <- function(data)
+{
+ for (i in 1:ncol(data))
+ {
+ if (!is.numeric(data[,i]))
+ return(FALSE)
+ }
+ TRUE
+}
+
round_it <- function( x )
{
if(isTRUE((x - floor(x))>=0.5))
@@ -60,30 +95,45 @@ remove_var_zero <- function(data, useNA = 'ifany')
process_data <- function( data, colnames=NULL )
{
+ #save.image("/tmp/to_process.R")
+
+ rownames(data) <- NULL
data.num <- as.data.frame(data)
if (!is.null(colnames))
{
data.num = subset(data.num, select = colnames)
+ ## if (ncol(data.num)==0)
+ ## {
+ ## print("ERROR no columns left after selecting colnames")
+ ## print("colnames before selection:")
+ ## print(colnames(data))
+ ## print("colnames to be selected:")
+ ## print(colnames)
+ ## stop("no columns left after selecting colnames")
+ ## }
}
- if (!is.numeric(data.num))
- {
+ if (!is_really_numeric(data.num))
data.num = nominal_to_binary(data.num)
- }
if(any(is.na(data.num)))
{
- require("gam")
+ print("WARNING data contains NAs; if you convert to a dataframe you can specify the missing value.")
+ suppressPackageStartupMessages(require("gam"))
data.repl = na.gam.replace(data.num)
}
else
data.repl = data.num
data.without_constant_vals = remove_var_zero(data.repl)
+ if(ncol(data.repl)>ncol(data.without_constant_vals))
+ print(paste("removed ",(ncol(data.repl)-ncol(data.without_constant_vals)),"/",ncol(data.repl)," features because of no variance"))
+ if (ncol(data.without_constant_vals)==0)
+ stop("ERROR no columns left after processing")
data.without_constant_vals
}
# depends on random seed
cluster_knn <- function( data, min=10, max=15) #, method="kmeans" )
{
- require("vegan")
+ suppressPackageStartupMessages(require("vegan"))
max <- min(max,nrow(unique(data)))
max <- min(max,nrow(data)-1)
if (min>max)
@@ -95,17 +145,36 @@ cluster_knn <- function( data, min=10, max=15) #, method="kmeans" )
cbind(s$partition[,m])
}
+dynamic_dist <- function(data, is_bin=NULL)
+{
+ if(is.null(is_bin))
+ is_bin = is_binary(data)
+ if (!is_bin)
+ {
+ print(paste("distance used: Euclidean"))
+ d <- dist(data, method="euclidean")
+ }
+ else
+ {
+ print(paste("distance used: Jaccard (Tanimoto)"))
+ suppressPackageStartupMessages(require("proxy"))
+ d <- dist(data, method="Jaccard")
+ }
+ m <- as.matrix(d)
+ diag(m)=0 # do this manually, as it is set to NA by the Tanimoto method
+ list(dist=d,matrix=m)
+}
+
#deterministic!
cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15, deep_split=4, ...)
{
max_num_clusters <- min(max_num_clusters,nrow(unique(data)))
max_num_clusters <- min(max_num_clusters,nrow(data)-1)
min_size <- round_it(nrow(data)/max_num_clusters)
-
- require("dynamicTreeCut")
- d <- dist(data, method = "euclidean")
- fit <- hclust(d, method=hclust_method)
- cut = cutreeDynamic(fit, method = "hybrid", distM = as.matrix(d), minClusterSize = min_size, deepSplit=deep_split, ...)
+ d <- dynamic_dist(data)
+ suppressPackageStartupMessages(require("dynamicTreeCut"))
+ fit <- hclust(d$dist, method=hclust_method)
+ cut = cutreeDynamic(fit, method = "hybrid", distM = d$matrix, minClusterSize = min_size, deepSplit=deep_split, ...)
print(paste("dynamicTreeCut clustering, minClusterSize:",min_size," result:"))
print(table(cut))
#print(cut)
@@ -175,27 +244,146 @@ random_split <- function( data, ratio=0.3 )
as.vector(split)
}
-stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" )
+
+contra_stratified_split <- function( data, ratio=0.3, colnames=NULL ) #, samplesize=10 )
{
data.processed = as.matrix(process_data( data, colnames ))
- print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess))
- cl = NULL
+ print(paste("contra strat split, #features:",ncol(data.processed),"/",ncol(data),", ratio:",ratio))
+
+ ##save.image("/tmp/contra_splitting.R")
+
+ binary=is_binary(data.processed)
+ print(paste("data is binary: ",binary))
- if (preprocess == "none")
+ if (!binary)
+ {
+ data.processed = as.matrix(pca_reduce_features(data.processed))
+ print(paste("#features reduced with pca: ",ncol(data.processed)))
+ }
+ else
{
#do nothing
+ }
+
+ samplesize = 50
+ if (nrow(data.processed)<=samplesize)
+ sample = as.vector(1:nrow(data.processed))
+ else
+ sample = random_split(data,ratio=samplesize/nrow(data))
+ sample_data = data.frame()
+ orig_idx = c()
+ for(i in 1:nrow(data.processed))
+ if(sample[i]==1)
+ {
+ sample_data = rbind(sample_data, data.processed[i,])
+ orig_idx = cbind(orig_idx,i)
+ }
+ #print("head(sample data) (random sampled with fixed samplesize)")
+ #print(head(sample_data))
+ print("orig_idx (original indices of sample data in orig data)")
+ print(as.vector(orig_idx))
+
+ m <- dynamic_dist(sample_data, is_bin=binary)$matrix
+
+ max_dist = 0
+ max_dist_idx = -1
+ for(i in 1:nrow(sample_data))
+ {
+ m_dist <- sum(m[,i])
+ if (max_dist_idx==-1 || m_dist>max_dist)
+ {
+ max_dist = m_dist
+ max_dist_idx = i
+ }
+ }
+ print("max-dist (in sample distance matrix)")
+ print(max_dist)
+ print("max-dist-id")
+ print(max_dist_idx)
+
+ anti_strat_center = orig_idx[max_dist_idx]
+ print("anti strat center (orignal index of max-distindex)")
+ print(anti_strat_center)
+
+ #anti_strat_center = sample(1:nrow(data.processed),1)
+
+ dist = array(0,nrow(data.processed))
+ if (!binary)
+ {
+ for(i in 1:nrow(data.processed))
+ dist[i] = sqrt(sum((data.processed[i,] - data.processed[anti_strat_center,]) ^ 2))
+ }
+ else
+ {
+ for(i in 1:nrow(data.processed))
+ dist[i] = as.vector(dist(rbind(data.processed[i,],data.processed[anti_strat_center,]),method="Jaccard"))
+ }
+
+ print("head(dist) (distance from all data to anti strat center)")
+ print(head(dist))
+ max = max(dist)
+ print("max (maximum distance)")
+ print(max)
+
+ not_nil = 0
+ prob = array(0,nrow(data.processed))
+ #raw = array(0,nrow(data.processed))
+ for(i in 1:nrow(data.processed))
+ {
+ prob[i] = (1-dist[i]/max) ^ 100
+ #raw[i] = 1-dist[i]/max
+ if(prob[i]!=0)
+ not_nil = not_nil+1
+ }
+
+
+ print("head(prob) (convert distance into probability)")
+ print(head(prob))
+ sum_prop = sum(prob)
+ print("sum prob (sum of probabilities)")
+ print(sum_prop)
+
+ num_sel = round_it(nrow(data.processed)*ratio)
+ print(paste("selecting",num_sel,"from",nrow(data.processed),"according to prob ( prob!=0 for",not_nil,")"))
+ if (num_sel>not_nil)
+ {
+ for(i in 1:nrow(data.processed))
+ if(prob[i]==0)
+ prob[i]=.Machine$double.xmin
}
- else if (preprocess == "pca")
+
+ selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob)
+ split <- array(0,nrow(data.processed))
+ for(i in selected)
+ {
+ #print(paste("sel ",i," raw ",raw[i]," dist ",prop[i]))
+ split[i] <- 1
+ }
+ split = as.vector(split)
+
+ cl <- array(0,nrow(data.processed))
+ cl[anti_strat_center] = 1
+
+ list(split=split,cluster=cl)
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL ) #, preprocess="none"
+{
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio)) #," preprocess: ",preprocess))
+ cl = NULL
+
+
+
+ if (!is_binary(data.processed))
{
data.processed = as.matrix(pca_reduce_features(data.processed))
- print(paste("#features reduced with pca: ",ncol(data.processed)))
+ print(paste("#features reduced with pca: ",ncol(data.processed)))
}
- else
- stop("unknown preprocess")
if (method == "samplecube")
{
- require("sampling")
+ suppressPackageStartupMessages(require("sampling"))
# adjust ratio to make samplecube return exact number of samples
ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
@@ -205,7 +393,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s
else if (method == "cluster_knn")
{
cl = as.vector(cluster_knn(data.processed))
-# require("caret")
+# suppressPackageStartupMessages(require("caret"))
# res = createDataPartition(cl,p=ratio)
# split = rep(1, times=nrow(data))
# for (j in 1:nrow(data))
@@ -219,7 +407,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s
{
cl = as.vector(cluster_hierarchical(data.processed))
- #require("caret")
+ #suppressPackageStartupMessages(require("caret"))
#res = createDataPartition(cl,p=ratio)
#split = rep(1, times=nrow(data))
#for (j in 1:nrow(data))
@@ -270,7 +458,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s
## }
## else
## {
- ## require("sampling")
+ ## suppressPackageStartupMessages(require("sampling"))
## stratified_split(cl,ratio,"samplecube")
## }
## }
@@ -280,7 +468,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s
list(split=split, cluster=cl)
}
-anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
+anti_stratified_split_OLD <- function( data, ratio=0.3, colnames=NULL)
{
if (ratio > 0.5)
{
@@ -336,7 +524,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
folds = rep(0, times=nrow(data))
for (i in 1:(num_folds-1))
{
- require("sampling")
+ suppressPackageStartupMessages(require("sampling"))
prop = 1/(num_folds-(i-1))
print(paste("fold",i,"/",num_folds," prop",prop))
pik = rep(prop,times=nrow(data))
@@ -357,7 +545,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
}
else if (method == "cluster")
{
- require("TunePareto")
+ suppressPackageStartupMessages(require("TunePareto"))
cl = cluster(data.processed)
res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
folds = rep(0, times=nrow(data))
@@ -398,11 +586,15 @@ add_duplicates <- function( data, dup_indices ) {
sammon_duplicates <- function( data, ... ) {
di <- duplicate_indices(data)
- print(di)
+ #print("duplicate indices")
+ #print(di)
u <- unique(data)
print(paste('unique data points',nrow(u),'of',nrow(data)))
if(nrow(u) <= 4) stop("number of unique datapoints <= 4")
- points_unique <- sammon(dist(u), ...)$points
+ distance <- dynamic_dist(u, is_binary(data))$dist
+ points_unique <- sammon(distance, ...)$points
+ #print("points unique")
+ #print(points_unique)
if (nrow(u)<nrow(data))
{
points <- add_duplicates(points_unique, di)
@@ -422,11 +614,11 @@ pca_reduce_features <- function( data, max_num_features=1000, min_explained_vari
#data.processed = process_data( data )
data.pca <- prcomp(data,scale=TRUE)
- print("pca reduce features2")
+ #print("pca reduce features2")
data.var <- data.pca$sdev^2
- print("pca reduce features3")
+ #print("pca reduce features3")
explained_variance <- cumsum(data.var)/sum(data.var)
@@ -448,171 +640,285 @@ plot_pre_process <- function( data, method="pca" )
}
else if (method == "smacof")
{
- require("smacof")
+ suppressPackageStartupMessages(require("smacof"))
data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
data.emb$conf
}
else if (method == "sammon")
{
- require("MASS")
+ suppressPackageStartupMessages(require("MASS"))
sammon_duplicates(data.processed, k=2)
}
else
stop("unknown method")
}
+
plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... )
-{
- if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
- stop("data not suitable for plotting, plot_pre_process() first")
-
- plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
- if (is.null(names))
- names <- c("split 1","split 2")
- #colos = as.double(rep(2:(max(color_idx)+2)))
- #legend("topleft",names,pch=2,col=colos)
-
- #legend("topleft",names[1],pch=as.double(c(1)),col="red")
-
- ## for (j in max(color_idx):0)
- ## {
- ## for (k in max(shape_idx):0)
- ## {
- ## set = c()
- ## for (i in 1:nrow(data))
- ## if (color_idx[i]==j && shape_idx[i]==k)
- ## set = c(set,i)
- ## points(data[set,], pch = 15+k, col=(j+2))
- ## }
- ## }
- col_offset = 2
- if (!is.null(circle_idx))
- col_offset = 1
- if(is.null(color_idx))
- {
- color_idx <- array(0,nrow(data))
- col_offset=3
- }
- for (j in 0:max(color_idx))
- {
- set = c()
- for (i in 1:nrow(data))
- if (color_idx[i]==j)
- set = c(set,i)
- points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
- }
- if (!is.null(circle_idx))
- {
- set = c()
- for (i in 1:nrow(data))
- if (circle_idx[i]==1)
+{
+ if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+ stop("data not suitable for plotting, plot_pre_process() first")
+
+ plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+ if (is.null(names))
+ names <- c("split 1","split 2")
+ #colos = as.double(rep(2:(max(color_idx)+2)))
+ #legend("topleft",names,pch=2,col=colos)
+
+ #legend("topleft",names[1],pch=as.double(c(1)),col="red")
+
+ ## for (j in max(color_idx):0)
+ ## {
+ ## for (k in max(shape_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && shape_idx[i]==k)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 15+k, col=(j+2))
+ ## }
+ ## }
+
+
+ col_offset = 2
+ if(is.null(color_idx))
+ {
+ color_idx <- array(0,nrow(data))
+ col_offset=3
+ }
+ for (j in 0:max(color_idx))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (color_idx[i]==j)
+ set = c(set,i)
+ points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset)
+ }
+ if (!is.null(circle_idx))
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (circle_idx[i]==1)
set = c(set,i)#
- points(data[set,], pch = 1, cex=1, col=1)
- points(data[set,], pch = 1, cex=1.8, col=1)
-
- ## for (j in max(color_idx):0)
- ## {
- ## set = c()
- ## for (i in 1:nrow(data))
- ## if (color_idx[i]==j && circle_idx[i]==1)
- ## set = c(set,i)
- ## points(data[set,], pch = 19, cex=1, col=1)
- ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset)
- ## points(data[set,], pch = 1, cex=2, col=1)
- ## }
-
- }
+
+ #print(set)
+ #print(data[set,])
+
+ points(matrix(data[set,],ncol=2), pch = 1, cex=1, col=1)
+ points(matrix(data[set,],ncol=2), pch = 1, cex=1.8, col=1)
+ #points(data[set,], pch = 1, cex=1, col=1)
+ #points(data[set,], pch = 1, cex=1.8, col=1)
+
+ ## for (j in max(color_idx):0)
+ ## {
+ ## set = c()
+ ## for (i in 1:nrow(data))
+ ## if (color_idx[i]==j && circle_idx[i]==1)
+ ## set = c(set,i)
+ ## points(data[set,], pch = 19, cex=1, col=1)
+ ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset)
+ ## points(data[set,], pch = 1, cex=2, col=1)
+ ## }
+
+ }
}
-superboxplot <- function( data, ..., significance_level=0.95 )
+superboxplot <- function( data, ..., significance_level=0.95, paired=T, test_closer_to_zero=F, alg_info=T )
{
b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1)))
- #print mean and stdev
- for (i in 1:ncol(data))
- {
- med <- sprintf("%.3f",b$stats[,i][3])
- stdev <- sprintf("%.3f",sqrt(var(data[,i])))
- mtext(paste(med,"+-",stdev),side=1,at=i,line=2)
- }
-
- #print significance tests
- if (nrow(data)>10)
+ if (alg_info)
{
- sig <- array(list(),c(ncol(data),ncol(data)))
- for (i in 1:(ncol(data)-1))
+ #print mean and stdev
+ for (i in 1:ncol(data))
{
- for (j in (i+1):ncol(data))
- {
- ttest = t.test(data[,i],data[,j],paired=T)
- #print(ttest$p.value)
- #print(is.na(ttest$p.value))
- if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
- {
- sig[i,j] = T
- sig[j,i] = T
- }
- else
- {
- sig[i,j] = F
- sig[j,i] = F
- }
- }
+ med <- sprintf("%.3f",b$stats[,i][3])
+ stdev <- sprintf("%.3f",sqrt(var(data[,i])))
+ mtext(paste(med,"+-",stdev),side=1,at=i,line=2)
}
- for (i in 1:ncol(data))
+
+ #print significance tests
+ if (nrow(data)>10 && significance_level>0 && significance_level<=1)
{
- ## s <- ""
- ## for (j in 1:ncol(data))
- ## {
- ## if (i == j)
- ## s <- paste( s, "-" ,sep="")
- ## else if (sig[i,j]==T)
- ## s <- paste( s, "X" ,sep="")
- ## else
- ## s <- paste( s, "0" ,sep="")
- ## }
-
- s<-""
- bigger <- ""
- for (j in 1:ncol(data))
- {
- #print(paste(i,j))
- if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3])
- bigger <- paste(bigger,j,sep=",")
- }
- if (nchar(bigger)>0)
- {
- bigger <- substring(bigger, 2)
- bigger <- paste(">(",bigger,")",sep="")
- s <- paste(s,bigger,sep=" ")
- }
- smaller <- ""
- for (j in 1:ncol(data))
+ sig <- array(list(),c(ncol(data),ncol(data)))
+ sig_var <- array(list(),c(ncol(data),ncol(data)))
+ for (i in 1:(ncol(data)-1))
{
- #print(paste(i,j))
- if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3])
- smaller <- paste(smaller,j,sep=",")
+ for (j in (i+1):ncol(data))
+ {
+ if (test_closer_to_zero==F)
+ ttest = t.test(data[,i],data[,j],paired=paired)
+ else
+ ttest = ttest_closer_to_zero(data[,i],data[,j],paired=paired)
+ if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value)
+ {
+ sig[i,j] = T
+ sig[j,i] = T
+ }
+ else
+ {
+ sig[i,j] = F
+ sig[j,i] = F
+ }
+
+ ftest = var.test(data[,i],data[,j])
+ if ( !is.na(ftest$p.value) && 1-significance_level > ftest$p.value)
+ {
+ sig_var[i,j] = T
+ sig_var[j,i] = T
+ }
+ else
+ {
+ sig_var[i,j] = F
+ sig_var[j,i] = F
+ }
+ }
}
- if (nchar(smaller)>0)
+ for (i in 1:ncol(data))
{
- smaller <- substring(smaller, 2)
- smaller <- paste("<(",smaller,")",sep="")
- s <- paste(s,smaller,sep=" ")
+ ## s <- ""
+ ## for (j in 1:ncol(data))
+ ## {
+ ## if (i == j)
+ ## s <- paste( s, "-" ,sep="")
+ ## else if (sig[i,j]==T)
+ ## s <- paste( s, "X" ,sep="")
+ ## else
+ ## s <- paste( s, "0" ,sep="")
+ ## }
+
+ s<-""
+ bigger <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T)
+ {
+ if (test_closer_to_zero==F && b$stats[,i][3] > b$stats[,j][3])
+ bigger <- paste(bigger,j,sep=",")
+ else if (test_closer_to_zero==T && abs(b$stats[,i][3]) > abs(b$stats[,j][3]))
+ bigger <- paste(bigger,j,sep=",")
+ }
+ }
+ if (nchar(bigger)>0)
+ {
+ s<-"med"
+ bigger <- substring(bigger, 2)
+ bigger <- paste(">(",bigger,")",sep="")
+ s <- paste(s,bigger,sep=" ")
+ }
+ smaller <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig[i,j]==T)
+ {
+ if (test_closer_to_zero==F && b$stats[,i][3] < b$stats[,j][3])
+ smaller <- paste(smaller,j,sep=",")
+ else if (test_closer_to_zero==T && abs(b$stats[,i][3]) < abs(b$stats[,j][3]))
+ smaller <- paste(smaller,j,sep=",")
+ }
+ }
+ if (nchar(smaller)>0)
+ {
+ if(nchar(bigger)==0)
+ s<-"med"
+ smaller <- substring(smaller, 2)
+ smaller <- paste("<(",smaller,")",sep="")
+ s <- paste(s,smaller,sep=" ")
+ }
+
+ bigger <- ""
+ for (j in 1:ncol(data))
+ {
+ if (i!=j && sig_var[i,j]==T && var(data[,i]) > var(data[,j]))
+ bigger <- paste(bigger,j,sep=",")
+ }
+ if (nchar(bigger)>0)
+ {
+ s <- paste(s,"var",sep=" ")
+ bigger <- substring(bigger, 2)
+ bigger <- paste(">(",bigger,")",sep="")
+ s <- paste(s,bigger,sep=" ")
+ }
+ smaller <- ""
+ for (j in 1:ncol(data))
+ {
+ #print(paste(i,j))
+ if (i!=j && sig_var[i,j]==T && var(data[,i]) < var(data[,j]))
+ smaller <- paste(smaller,j,sep=",")
+ }
+ if (nchar(smaller)>0)
+ {
+ if(nchar(bigger)==0)
+ s <- paste(s,"var",sep=" ")
+ smaller <- substring(smaller, 2)
+ smaller <- paste("<(",smaller,")",sep="")
+ s <- paste(s,smaller,sep=" ")
+ }
+
+
+ mtext(s,side=1,at=i,line=3)
}
- mtext(s,side=1,at=i,line=3)
}
}
#print(sig)
}
+# Two-sample t-test for the question "which sample is closer to zero?".
+# Both samples are first shifted towards zero by the smaller absolute
+# median (see pre_process_ttest_closer_to_zero below) so that a plain
+# t.test on the shifted data compares distance from zero rather than
+# raw location.
+#   x, y : numeric vectors
+#   ...  : forwarded to t.test (e.g. conf.level, paired)
+# Returns the htest object produced by t.test.
+ttest_closer_to_zero <- function( x, y, ... )
+{
+  prep = pre_process_ttest_closer_to_zero(x,y)
+  t.test(prep$x, prep$y, ...)
+}
+
+# Prepare two numeric vectors for a "closer to zero" t-test.
+# If the medians of x and y have the same sign, the vectors are returned
+# unchanged (an ordinary t.test already answers the question). If the
+# medians straddle zero, both samples are moved towards zero by the
+# smaller absolute median, so that the sample that was nearer to zero
+# ends up centered at (roughly) zero and the t-test measures the excess
+# distance of the other sample.
+# Returns list(x=..., y=...) preserving the original argument order.
+# NOTE(review): when medX == medY (e.g. both 0) the else-branch runs and
+# the x/y roles are decided by the medX>medY comparisons — confirm this
+# tie behavior is intended.
+pre_process_ttest_closer_to_zero <- function( x, y )
+{
+  medX = median(x)
+  medY = median(y)
+  if((medX > 0 && medY > 0)|| (medX < 0 && medY < 0))
+  {
+    # same sign: no shifting needed
+    list(x=x,y=y)
+  }
+  else
+  {
+    # assume x is the sample on the positive side, y on the negative side
+    pos_data = x
+    neg_data = y
+    if (medX<medY)
+    {
+      # medians straddle zero the other way round: swap the roles
+      pos_data = y
+      neg_data = x
+    }
+    # shift both samples towards zero by the smaller absolute median
+    shift = min(abs(medX),abs(medY))
+    pos_data = pos_data - shift
+    neg_data = neg_data + shift
+    # return the shifted samples in the caller's original x/y order
+    if (medX>medY)
+      list(x=pos_data,y=neg_data)
+    else
+      list(x=neg_data,y=pos_data)
+  }
+}
+
+# Convenience wrapper for interactive use: compute a contra-stratified
+# split of the data and visualize it, circling cluster membership and
+# coloring the split assignment.
+#   ratio : fraction of instances selected by the split (default 0.33)
+# NOTE(review): depends on globals `data` and `plot_data` being defined
+# (see the commented-out setup code below) and on contra_stratified_split
+# returning a list with $cluster and $split — defined elsewhere in this
+# file; confirm both are in scope before calling.
+split_plot <- function(ratio=0.33)
+{
+  print("splitting")
+  split = contra_stratified_split(data, ratio=ratio)
+  #print(split$cluster)
+  print("plotting")
+  plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split)
+}
+
#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
-#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=30), ncol=5)
+#data<-b
#data<-rbind(a,b)
#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
#data<-rbind(data,c)
#data=iris[1:4]
+
+#data = data[1:10,]
+
#require("datasets")
#data=ChickWeight
#data=freeny
@@ -621,6 +927,10 @@ superboxplot <- function( data, ..., significance_level=0.95 )
#pca_reduce_features(data)
#data=ethanol
#split = stratified_k_fold_split(data, num_folds=3)
+
+#set.seed(as.numeric(Sys.time()))
+#split = contra_stratified_split(data, ratio=0.1)
+#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical")
#split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
@@ -631,38 +941,25 @@ superboxplot <- function( data, ..., significance_level=0.95 )
#print(split)
#print(sum(split)
-#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split)
-#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster)
+#print(split$cluster)
+#print(which(split$cluster==1))
+
+#plot_data = plot_pre_process(data, method="sammon")
+
+#print(nrow(plot_data))
+#print(ncol(plot_data))
+#print(head(plot_data))
+#print(plot_data[which(split$cluster==1),])
+
+
+#plot_data = data
+#plot_split(plot_data,color_idx=split$split)
+#plot_split(plot_data,circle_idx=split$split,color_idx=split$cluster)
+#plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split)
#cl = cluster(data)
-## # stratified splitting plot
-##
-## #png("data.png",width=1024,height=768,pointsize=20)
-## png("random.png",width=1024,height=768,pointsize=20)
-## #png("stratfied_cluster.png",width=1024,height=768,pointsize=20)
-## #png("stratfied_cluster_selected.png",width=1024,height=768,pointsize=20)
-## #png("stratfied_selected.png",width=1024,height=768,pointsize=20)
-##
-## #seed = sample(1:1000000,1)
-## #print(paste("seed ",seed))
-## #set.seed(seed)
-##
-## set.seed(86281) #unlucky random
-## #set.seed(160719) #nice stratifed
-##
-## data=df_11306
-## data=cbind(data[1],data[3])
-## plot_split(data,xlab="Molecular weight",ylab="LogP")
-##
-## #split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit")
-## split = list(split=random_split(data, ratio=0.1))
-##
-## plot_split(data,color_idx=split$split,xlab="Molecular weight",ylab="LogP")
-## #plot_split(data,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP")
-## #plot_split(data,circle_idx=split$split,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP")
-##
-## dev.off() \ No newline at end of file
+