diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-10-12 10:21:42 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-10-12 10:21:42 +0200 |
commit | 8aa3468ce02816f515f304ac7144b74a6eb2f986 (patch) | |
tree | 2a5e4bd382d870b2197df0211bb7ad71eee2b71c | |
parent | 56d3b604b089af91e3e82c8e9ff3e77841d845fb (diff) |
add contra stratified splitting, add jaccard distance for binary fragments, add test stuff, additional fixes
-rw-r--r-- | lib/r-util.rb | 222 | ||||
-rw-r--r-- | lib/stratification.R | 667 |
2 files changed, 654 insertions, 235 deletions
diff --git a/lib/r-util.rb b/lib/r-util.rb index 952b873..fddcd43 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -21,6 +21,14 @@ class Array end +class RinRuby + def puts(object) + object.to_s.split("\n").each do |s| + LOGGER.debug "R> #{s.chomp}" if s.chomp.length>0 + end + end +end + module OpenTox class RUtil @@ -29,6 +37,7 @@ module OpenTox def initialize @r = RinRuby.new(true,false) unless defined?(@r) and @r + #@r.eval "sink(type='message')" @r.eval ".libPaths('#{PACKAGE_DIR}')" @r_packages = @r.pull "installed.packages()[,1]" ["sampling","gam","vegan","dynamicTreeCut"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto" @@ -58,21 +67,46 @@ module OpenTox end end +# def ttest_matrix_deviation(arrays, significance_level=0.95) +# +# LOGGER.info("perform ttest matrix deviation") +# result = [] +# arrays.size.times do |i| +# result[i] = [] +# @r.assign "v#{i}",arrays[i] +# @r.eval "v#{i}<-abs(as.numeric(v#{i}))" +# end +# (arrays.size-1).times do |i| +# (i+1..arrays.size-1).each do |j| +# raise if arrays[i].size!=arrays[j].size +# @r.eval "ttest = t.test(v#{i}-v#{j},conf.level=#{significance_level})" +# min = @r.pull "ttest$conf.int[1]" +# max = @r.pull "ttest$conf.int[2]" +# if (min>0 or max<0) +# result[i][j]=true +# result[j][i]=true +# else +# result[i][j]=false +# result[j][i]=false +# end +# end +# end +# result +# end - def ttest_matrix(arrays, paired, significance_level=0.95) - - LOGGER.info("perform ttest matrix") + def pvalue_test_matrix(test, arrays, significance_level=0.95, params="") + LOGGER.info("perform test '#{test}' matrix") result = [] arrays.size.times do |i| result[i] = [] @r.assign "v#{i}",arrays[i] + @r.eval "v#{i}<-as.numeric(v#{i})" end (arrays.size-1).times do |i| (i+1..arrays.size-1).each do |j| - raise if paired && arrays[i].size!=arrays[j].size - @r.eval "ttest = t.test(as.numeric(v#{i}),as.numeric(v#{j}),paired=#{paired ? "T" : "F"})" - t = @r.pull "ttest$statistic" - p = @r.pull "ttest$p.value" + @r.eval "test = #{test}(v#{i},v#{j},#{params})" + t = @r.pull "test$statistic" + p = @r.pull "test$p.value" if (1-significance_level > p) result[i][j]=true result[j][i]=true @@ -85,24 +119,61 @@ module OpenTox result end + def ttest_matrix(arrays, paired, significance_level=0.95) + (arrays.size-1).times do |i| + (i+1..arrays.size-1).each do |j| + raise if paired && arrays[i].size!=arrays[j].size + end + end + params = "paired=#{paired ? "T" : "F"}" + pvalue_test_matrix("t.test",arrays,significance_level,params) + end + + def ftest_matrix(arrays, paired, significance_level=0.95) + pvalue_test_matrix("var.test",arrays,significance_level) + end + + def ttest_closer_to_zero_matrix(arrays, paired, significance_level=0.95) + (arrays.size-1).times do |i| + (i+1..arrays.size-1).each do |j| + raise if paired && arrays[i].size!=arrays[j].size + end + end + params = "paired=#{paired ? "T" : "F"}" + pvalue_test_matrix("ttest_closer_to_zero",arrays,significance_level,params) + end + def pvalue_test(test, array1, array2, significance_level=0.95, params="") + LOGGER.info("perform test '#{test}'") + @r.assign "v1",array1 + @r.assign "v2",array2 + @r.eval "test = #{test}(as.numeric(v1),as.numeric(v2),#{params})" + t = @r.pull "test$statistic" + p = @r.pull "test$p.value" + if (1-significance_level > p) + t + else + 0 + end + end # <0 -> array1 << array2 # 0 -> no significant difference # >0 -> array2 >> array1 def ttest(array1, array2, paired, significance_level=0.95) - LOGGER.info("perform ttest") - @r.assign "v1",array1 - @r.assign "v2",array2 raise if paired && array1.size!=array2.size - @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=#{paired ? "T" : "F"})" - t = @r.pull "ttest$statistic" - p = @r.pull "ttest$p.value" - if (1-significance_level > p) - t - else - 0 - end + params = "paired=#{paired ? "T" : "F"}" + pvalue_test("t.test",array1,array2,significance_level,params) + end + + def ftest(array1, array2, significance_level=0.95) + pvalue_test("var.test",array1,array2,significance_level) + end + + def ttest_closer_to_zero + raise if paired && array1.size!=array2.size + params = "paired=#{paired ? "T" : "F"}" + pvalue_test("ttest_closer_to_zero",array1,array2,significance_level,params) end def pvalue(array1, array2) @@ -119,10 +190,13 @@ module OpenTox min = @r.pull "ttest$conf.int[1]" max = @r.pull "ttest$conf.int[2]" if value2 <= min + LOGGER.debug "perform ttest-single: significant=true, #{value2} is lower than conf-interval [ #{min} - #{max} ]" 1 elsif value2 >= max + LOGGER.debug "perform ttest-single: significant=true, #{value2} is higher than conf-interval [ #{min} - #{max} ]" -1 else + LOGGER.debug "perform ttest-single: significant=false, #{value2} is inside conf-interval [ #{min} - #{max} ]" 0 end end @@ -154,7 +228,7 @@ module OpenTox max_median_idx = i if max_median==med min_median = [min_median,med].min min_median_idx = i if min_median==med - data[i] = [data[i][0]+"(#{values.size})",data[i][1]] + data[i] = [data[i][0]+"(#{values.size})",data[i][1]] if @@boxplot_alg_info end if min != max times = max/min.to_f @@ -182,25 +256,30 @@ module OpenTox hlines << [max_median,2+max_median_idx] hlines << [min_median,2+min_median_idx] plot_to_files(files, hlines) do |file| - @r.eval "superboxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1})#{param_str})" + @r.eval "superboxplot(boxdata,alg_info=#{@@boxplot_alg_info ? "T" : "F"},main='#{title}',col=rep(2:#{data.size+1})#{param_str})" end end # embedds feature values of two datasets into 2D and plots it # def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, - features=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil ) + prediction_feature=nil, subjectid=nil, waiting_task=nil, direct_plot=false, title=nil, color_feature=nil ) LOGGER.debug("r-util> create feature value plot") d1 = OpenTox::Dataset.find(dataset_uri1,subjectid) d2 = OpenTox::Dataset.find(dataset_uri2,subjectid) - if features - [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}} - else - raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if + + raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if (d1.features.keys.sort != d2.features.keys.sort) - features = d1.features.keys - end + features = d1.features.keys + if prediction_feature + if features.include?(prediction_feature) + features -= [prediction_feature] + else + LOGGER.debug "prediction feature #{prediction_feature} cannot be remvoed because not included in #{dataset_uri1}" + end + end + raise "at least two features needed" if d1.features.keys.size<2 waiting_task.progress(25) if waiting_task @@ -258,6 +337,11 @@ module OpenTox if (is_numerical) @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values') { + if( log && ( min(data1)<=0 || min(data2)<=0 )) + { + print('disabling log because of datapoints <= 0') + log = FALSE + } if (log) { data1 <- log(data1) @@ -265,7 +349,8 @@ module OpenTox xlab = paste('logarithm of',xlab,sep=' ') } xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2))))) - h <- hist(rbind(data1,data2),plot=F) + save.image('/tmp/image.R') + h <- hist(c(data1,data2),plot=F) h1 <- hist(data1,plot=F,breaks=h$breaks) h2 <- hist(data2,plot=F,breaks=h$breaks) xlims = c(min(h$breaks),max(h$breaks)) @@ -356,32 +441,44 @@ module OpenTox end return train, test else - raise unless stratification=~/^(super|super4|super5|anti)$/ + raise unless stratification=~/^(super|super4|super5|contra)$/ anti = "" super_method = "" super_method_2 = "" - preprocess = "" + #preprocess = "" case stratification - when "anti" - anti = "anti_" + when "contra" + anti = "contra_" when "super" super_method = ", method='cluster_knn'" when "super4" super_method = ", method='cluster_hierarchical'" - preprocess = ", preprocess='pca'" + #preprocess = ", preprocess='pca'" when "super5" super_method = ", method='cluster_hierarchical'" super_method_2 = ", method_2='explicit'" - preprocess = ", preprocess='pca'" + #preprocess = ", preprocess='pca'" end - LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})" - @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2} #{preprocess})" + LOGGER.debug "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2})" # #{preprocess} + @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features} #{super_method} #{super_method_2})" # #{preprocess} split = @r.pull 'split$split' cluster = (store_split_clusters ? @r.pull('split$cluster') : nil) metadata[DC.title] = "Training dataset split of "+dataset.uri train = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==1 } metadata[DC.title] = "Test dataset split of "+dataset.uri test = split_to_dataset( df, split, metadata, subjectid, missing_values, cluster ){ |i| i==0 } + + f = "/tmp/split_pic.svg" + LOGGER.debug "plotting to #{f} .." + @r.eval "num_feats = #{split_features ? split_features.size : "ncol(plot_data)"}" + @r.eval "plot_data = process_data(#{df}, #{str_split_features})" + @r.eval "plot_data = plot_pre_process(plot_data, method='sammon')" + @r.eval "title = paste('sammon embedding for splitting #{df},',num_feats,'features,',nrow(plot_data),'instances')" + plot_to_files([f]) do |file| + @r.eval "plot_split(plot_data,color_idx=split$split, main=title)" + end + LOGGER.debug "plotting to #{f} .. done" + return train, test end end @@ -507,26 +604,35 @@ module OpenTox LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}" compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)} features.each{|f| dataset.add_feature(f,@@feats[df][f])} - features.size.times do |c| - LOGGER.debug "r-util> dataframe to dataset - feature #{c+1} / #{features.size}" if - c%25==0 && (features.size*compounds.size)>100000 - if features[c]=~/\/feature\/bbrc\// - numeric=true + features.size.times do |f_i| + LOGGER.debug "r-util> dataframe to dataset - feature #{f_i+1} / #{features.size}" if + f_i%25==0 && (features.size*compounds.size)>100000 + if features[f_i]=~/\/feature\/bbrc\// + numeric="int" else - type = @@feats[df][features[c]][RDF.type] + type = @@feats[df][features[f_i]][RDF.type] unless type LOGGER.debug "r-util> derive feature type by rest-call" - feat = OpenTox::Feature.find(features[c],subjectid) + feat = OpenTox::Feature.find(features[f_i],subjectid) type = feat.metadata[RDF.type] end - numeric = type.to_a.flatten.include?(OT.NumericFeature) + numeric = type.to_a.flatten.include?(OT.NumericFeature) ? "float" : nil end - compounds.size.times do |r| - if compound_indices==nil or compound_indices.include?(r) - dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c], true) if - ((NEW and values[r][c]!=nil) or (values[r][c]!="NA" and !(values[r][c] =~ missing_value_regexp))) + case numeric + when "int" + def convert_numeric(v); v.to_i; end + when "float" + def convert_numeric(v); v.to_f; end + else + def convert_numeric(v); v; end + end + compounds.size.times do |c_i| + if compound_indices==nil or compound_indices.include?(c_i) + dataset.add(compounds[c_i],features[f_i],convert_numeric(values[c_i][f_i]), true) if + ((NEW and values[c_i][f_i]!=nil) or (values[c_i][f_i]!="NA" and !(values[c_i][f_i] =~ missing_value_regexp))) end end + end if cluster cluster_feature = "http://no.such.domain/feature/split_cluster" @@ -595,20 +701,36 @@ module OpenTox @@svg_plot_width = 12 @@svg_plot_height = 8 - + public def set_svg_plot_size(width,height) @@svg_plot_width = width @@svg_plot_height = height end + @@png_plot_width = 800 + @@png_plot_height = 600 + @@png_plot_pointsize = 12 + + def set_png_plot_size(width,height,pointsize) + @@png_plot_width = width + @@png_plot_height = height + @@png_plot_pointsize = pointsize + end + + @@boxplot_alg_info = true + + def set_boxplot_alg_info(boxplot_alg_info) + @@boxplot_alg_info = boxplot_alg_info + end + private def plot_to_files(files,hlines=nil) files.each do |file| if file=~/(?i)\.svg/ @r.eval("svg('#{file}',#{@@svg_plot_width},#{@@svg_plot_height})") elsif file=~/(?i)\.png/ - @r.eval("png('#{file}')") + @r.eval("png('#{file}',width=#{@@png_plot_width},height=#{@@png_plot_height},pointsize=#{@@png_plot_pointsize})") else raise "invalid format: "+file.to_s end diff --git a/lib/stratification.R b/lib/stratification.R index 238515d..dcc630e 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -1,4 +1,39 @@ +# bbrc dataset 249 x 530 +#load("/home/martin/workspace/ValidationExperiments/bbrc_image.R") +#data=df_12117 + +# 250 compounds, 2 features +#load("/home/martin/workspace/ValidationExperiments/strat_pics/image.R") +#data=df_11306 +#data=cbind(data[1],data[3]) + +# 1000 compounds, 226 features +#load("/home/martin/tmp/image_12171.R") +#data=df_12171 + +is_binary <- function( data ) +{ + #print(length(data[data==0])) + #print(length(data[data==1])) + #print(length(data[data==0])+length(data[data==1])) + #print(nrow(data)) + #print(ncol(data)) + #print(nrow(data)*ncol(data)) + #print(head(data)) + length(data[data==0])+length(data[data==1]) == nrow(data)*ncol(data) +} + +is_really_numeric <- function(data) +{ + for (i in 1:ncol(data)) + { + if (!is.numeric(data[,i])) + return(FALSE) + } + TRUE +} + round_it <- function( x ) { if(isTRUE((x - floor(x))>=0.5)) @@ -60,30 +95,45 @@ remove_var_zero <- function(data, useNA = 'ifany') process_data <- function( data, colnames=NULL ) { + #save.image("/tmp/to_process.R") + + rownames(data) <- NULL data.num <- as.data.frame(data) if (!is.null(colnames)) { data.num = subset(data.num, select = colnames) + ## if (ncol(data.num)==0) + ## { + ## print("ERROR no columns left after selecting colnames") + ## print("colnames before selection:") + ## print(colnames(data)) + ## print("colnames to be selected:") + ## print(colnames) + ## stop("no columns left after selecting colnames") + ## } } - if (!is.numeric(data.num)) - { + if (!is_really_numeric(data.num)) data.num = nominal_to_binary(data.num) - } if(any(is.na(data.num))) { - require("gam") + print("WARNING data contains NAs, if you convert to dataframe you can specify the missing values value.") + suppressPackageStartupMessages(require("gam")) data.repl = na.gam.replace(data.num) } else data.repl = data.num data.without_constant_vals = remove_var_zero(data.repl) + if(ncol(data.repl)>ncol(data.without_constant_vals)) + print(paste("removed ",(ncol(data.repl)-ncol(data.without_constant_vals)),"/",ncol(data.repl)," features because of no variance")) + if (ncol(data.without_constant_vals)==0) + stop("ERROR no columns left after processing") data.without_constant_vals } # depends on random seed cluster_knn <- function( data, min=10, max=15) #, method="kmeans" ) { - require("vegan") + suppressPackageStartupMessages(require("vegan")) max <- min(max,nrow(unique(data))) max <- min(max,nrow(data)-1) if (min>max) @@ -95,17 +145,36 @@ cluster_knn <- function( data, min=10, max=15) #, method="kmeans" ) cbind(s$partition[,m]) } +dynamic_dist <- function(data, is_bin=NULL) +{ + if(is.null(is_bin)) + is_bin = is_binary(data) + if (!is_bin) + { + print(paste("distance used: Euclidean")) + d <- dist(data, method="euclidean") + } + else + { + print(paste("distance used: Jaccard (Tanimoto)")) + suppressPackageStartupMessages(require("proxy")) + d <- dist(data, method="Jaccard") + } + m <- as.matrix(d) + diag(m)=0 # do this manually, as it is set to NA by the Tanimoto method + list(dist=d,matrix=m) +} + #deterministic! cluster_hierarchical <- function(data, hclust_method="ward", max_num_clusters=15, deep_split=4, ...) { max_num_clusters <- min(max_num_clusters,nrow(unique(data))) max_num_clusters <- min(max_num_clusters,nrow(data)-1) min_size <- round_it(nrow(data)/max_num_clusters) - - require("dynamicTreeCut") - d <- dist(data, method = "euclidean") - fit <- hclust(d, method=hclust_method) - cut = cutreeDynamic(fit, method = "hybrid", distM = as.matrix(d), minClusterSize = min_size, deepSplit=deep_split, ...) + d <- dynamic_dist(data) + suppressPackageStartupMessages(require("dynamicTreeCut")) + fit <- hclust(d$dist, method=hclust_method) + cut = cutreeDynamic(fit, method = "hybrid", distM = d$matrix, minClusterSize = min_size, deepSplit=deep_split, ...) print(paste("dynamicTreeCut clustering, minClusterSize:",min_size," result:")) print(table(cut)) #print(cut) @@ -175,27 +244,146 @@ random_split <- function( data, ratio=0.3 ) as.vector(split) } -stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL, preprocess="none" ) + +contra_stratified_split <- function( data, ratio=0.3, colnames=NULL ) #, samplesize=10 ) { data.processed = as.matrix(process_data( data, colnames )) - print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio," preprocess: ",preprocess)) - cl = NULL + print(paste("contra strat split, #features:",ncol(data.processed),"/",ncol(data),", ratio:",ratio)) + + ##save.image("/tmp/contra_splitting.R") + + binary=is_binary(data.processed) + print(paste("data is binary: ",binary)) - if (preprocess == "none") + if (!binary) + { + data.processed = as.matrix(pca_reduce_features(data.processed)) + print(paste("#features reduced with pca: ",ncol(data.processed))) + } + else { #do nothing + } + + samplesize = 50 + if (nrow(data.processed)<=samplesize) + sample = as.vector(1:nrow(data.processed)) + else + sample = random_split(data,ratio=samplesize/nrow(data)) + sample_data = data.frame() + orig_idx = c() + for(i in 1:nrow(data.processed)) + if(sample[i]==1) + { + sample_data = rbind(sample_data, data.processed[i,]) + orig_idx = cbind(orig_idx,i) + } + #print("head(sample data) (random sampled with fixed samplesize)") + #print(head(sample_data)) + print("orig_idx (orinal indices of sample data in orig data)") + print(as.vector(orig_idx)) + + m <- dynamic_dist(sample_data, is_bin=binary)$matrix + + max_dist = 0 + max_dist_idx = -1 + for(i in 1:nrow(sample_data)) + { + m_dist <- sum(m[,i]) + if (max_dist_idx==-1 || m_dist>max_dist) + { + max_dist = m_dist + max_dist_idx = i + } + } + print("max-dist (in sample distance matrix)") + print(max_dist) + print("max-dist-id") + print(max_dist_idx) + + anti_strat_center = orig_idx[max_dist_idx] + print("anti strat center (orignal index of max-distindex)") + print(anti_strat_center) + + #anti_strat_center = sample(1:nrow(data.processed),1) + + dist = array(0,nrow(data.processed)) + if (!binary) + { + for(i in 1:nrow(data.processed)) + dist[i] = sqrt(sum((data.processed[i,] - data.processed[anti_strat_center,]) ^ 2)) + } + else + { + for(i in 1:nrow(data.processed)) + dist[i] = as.vector(dist(rbind(data.processed[i,],data.processed[anti_strat_center,]),method="Jaccard")) + } + + print("head(dist) (distance from all data to anti strat center)") + print(head(dist)) + max = max(dist) + print("max (maxium distance)") + print(max) + + not_nil = 0 + prob = array(0,nrow(data.processed)) + #raw = array(0,nrow(data.processed)) + for(i in 1:nrow(data.processed)) + { + prob[i] = (1-dist[i]/max) ^ 100 + #raw[i] = 1-dist[i]/max + if(prob[i]!=0) + not_nil = not_nil+1 + } + + + print("head(prop) (convert distance into propability)") + print(head(prob)) + sum_prop = sum(prob) + print("sum prop (sum of propabilities)") + print(sum_prop) + + num_sel = round_it(nrow(data.processed)*ratio) + print(paste("selecting",num_sel,"from",nrow(data.processed),"according to prob ( prob!=0 for",not_nil,")")) + if (num_sel>not_nil) + { + for(i in 1:nrow(data.processed)) + if(prob[i]==0) + prob[i]=.Machine$double.xmin } - else if (preprocess == "pca") + + selected = sample(1:nrow(data.processed),num_sel,replace=FALSE,prob=prob) + split <- array(0,nrow(data.processed)) + for(i in selected) + { + #print(paste("sel ",i," raw ",raw[i]," dist ",prop[i])) + split[i] <- 1 + } + split = as.vector(split) + + cl <- array(0,nrow(data.processed)) + cl[anti_strat_center] = 1 + + list(split=split,cluster=cl) +} + +stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="samplecube", colnames=NULL ) #, preprocess="none" +{ + data.processed = as.matrix(process_data( data, colnames )) + print(paste("strat split, method: ",method," #features: ",ncol(data.processed)," ratio: ",ratio)) #," preprocess: ",preprocess)) + cl = NULL + + + + if (!is_binary(data.processed)) { data.processed = as.matrix(pca_reduce_features(data.processed)) - print(paste("#features reduced with pca: ",ncol(data.processed))) + print(paste("#features reduced with pca: ",ncol(data.processed))) } - else - stop("unknown preprocess") if (method == "samplecube") { - require("sampling") + suppressPackageStartupMessages(require("sampling")) # adjust ratio to make samplecube return exact number of samples ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed) pik = rep(ratio,times=nrow(data.processed)) @@ -205,7 +393,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s else if (method == "cluster_knn") { cl = as.vector(cluster_knn(data.processed)) -# require("caret") +# suppressPackageStartupMessages(require("caret")) # res = createDataPartition(cl,p=ratio) # split = rep(1, times=nrow(data)) # for (j in 1:nrow(data)) @@ -219,7 +407,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s { cl = as.vector(cluster_hierarchical(data.processed)) - #require("caret") + #suppressPackageStartupMessages(require("caret")) #res = createDataPartition(cl,p=ratio) #split = rep(1, times=nrow(data)) #for (j in 1:nrow(data)) @@ -270,7 +458,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s ## } ## else ## { - ## require("sampling") + ## suppressPackageStartupMessages(require("sampling")) ## stratified_split(cl,ratio,"samplecube") ## } ## } @@ -280,7 +468,7 @@ stratified_split <- function( data, ratio=0.3, method="cluster_knn", method_2="s list(split=split, cluster=cl) } -anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) +anti_stratified_split_OLD <- function( data, ratio=0.3, colnames=NULL) { if (ratio > 0.5) { @@ -336,7 +524,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna folds = rep(0, times=nrow(data)) for (i in 1:(num_folds-1)) { - require("sampling") + suppressPackageStartupMessages(require("sampling")) prop = 1/(num_folds-(i-1)) print(paste("fold",i,"/",num_folds," prop",prop)) pik = rep(prop,times=nrow(data)) @@ -357,7 +545,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna } else if (method == "cluster") { - require("TunePareto") + suppressPackageStartupMessages(require("TunePareto")) cl = cluster(data.processed) res = generateCVRuns(cl,ntimes=1,nfold=num_folds) folds = rep(0, times=nrow(data)) @@ -398,11 +586,15 @@ add_duplicates <- function( data, dup_indices ) { sammon_duplicates <- function( data, ... ) { di <- duplicate_indices(data) - print(di) + #print("duplicate indices") + #print(di) u <- unique(data) print(paste('unique data points',nrow(u),'of',nrow(data))) if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4") - points_unique <- sammon(dist(u), ...)$points + distance <- dynamic_dist(u, is_binary(data))$dist + points_unique <- sammon(distance, ...)$points + #print("points unique") + #print(points_unique) if (nrow(u)<nrow(data)) { points <- add_duplicates(points_unique, di) @@ -422,11 +614,11 @@ pca_reduce_features <- function( data, max_num_features=1000, min_explained_vari #data.processed = process_data( data ) data.pca <- prcomp(data,scale=TRUE) - print("pca reduce features2") + #print("pca reduce features2") data.var <- data.pca$sdev^2 - print("pca reduce features3") + #print("pca reduce features3") explained_variance <- cumsum(data.var)/sum(data.var) @@ -448,171 +640,285 @@ plot_pre_process <- function( data, method="pca" ) } else if (method == "smacof") { - require("smacof") + suppressPackageStartupMessages(require("smacof")) data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T) data.emb$conf } else if (method == "sammon") { - require("MASS") + suppressPackageStartupMessages(require("MASS")) sammon_duplicates(data.processed, k=2) } else stop("unknown method") } + plot_split <- function( data, color_idx=NULL, circle_idx=NULL, ... ) -{ - if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2])) - stop("data not suitable for plotting, plot_pre_process() first") - - plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... ) - if (is.null(names)) - names <- c("split 1","split 2") - #colos = as.double(rep(2:(max(color_idx)+2))) - #legend("topleft",names,pch=2,col=colos) - - #legend("topleft",names[1],pch=as.double(c(1)),col="red") - - ## for (j in max(color_idx):0) - ## { - ## for (k in max(shape_idx):0) - ## { - ## set = c() - ## for (i in 1:nrow(data)) - ## if (color_idx[i]==j && shape_idx[i]==k) - ## set = c(set,i) - ## points(data[set,], pch = 15+k, col=(j+2)) - ## } - ## } - col_offset = 2 - if (!is.null(circle_idx)) - col_offset = 1 - if(is.null(color_idx)) - { - color_idx <- array(0,nrow(data)) - col_offset=3 - } - for (j in 0:max(color_idx)) - { - set = c() - for (i in 1:nrow(data)) - if (color_idx[i]==j) - set = c(set,i) - points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) - } - if (!is.null(circle_idx)) - { - set = c() - for (i in 1:nrow(data)) - if (circle_idx[i]==1) +{ + if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2])) + stop("data not suitable for plotting, plot_pre_process() first") + + plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... ) + if (is.null(names)) + names <- c("split 1","split 2") + #colos = as.double(rep(2:(max(color_idx)+2))) + #legend("topleft",names,pch=2,col=colos) + + #legend("topleft",names[1],pch=as.double(c(1)),col="red") + + ## for (j in max(color_idx):0) + ## { + ## for (k in max(shape_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && shape_idx[i]==k) + ## set = c(set,i) + ## points(data[set,], pch = 15+k, col=(j+2)) + ## } + ## } + + + col_offset = 2 + if(is.null(color_idx)) + { + color_idx <- array(0,nrow(data)) + col_offset=3 + } + for (j in 0:max(color_idx)) + { + set = c() + for (i in 1:nrow(data)) + if (color_idx[i]==j) + set = c(set,i) + points(data[set,], pch = 19, cex=1, col=(max(color_idx)-j)+col_offset) + } + if (!is.null(circle_idx)) + { + set = c() + for (i in 1:nrow(data)) + if (circle_idx[i]==1) set = c(set,i)# - points(data[set,], pch = 1, cex=1, col=1) - points(data[set,], pch = 1, cex=1.8, col=1) - - ## for (j in max(color_idx):0) - ## { - ## set = c() - ## for (i in 1:nrow(data)) - ## if (color_idx[i]==j && circle_idx[i]==1) - ## set = c(set,i) - ## points(data[set,], pch = 19, cex=1, col=1) - ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset) - ## points(data[set,], pch = 1, cex=2, col=1) - ## } - - } + + #print(set) + #print(data[set,]) + + points(matrix(data[set,],ncol=2), pch = 1, cex=1, col=1) + points(matrix(data[set,],ncol=2), pch = 1, cex=1.8, col=1) + #points(data[set,], pch = 1, cex=1, col=1) + #points(data[set,], pch = 1, cex=1.8, col=1) + + ## for (j in max(color_idx):0) + ## { + ## set = c() + ## for (i in 1:nrow(data)) + ## if (color_idx[i]==j && circle_idx[i]==1) + ## set = c(set,i) + ## points(data[set,], pch = 19, cex=1, col=1) + ## points(data[set,], pch = 20, cex=1, col=(max(color_idx)-j)+col_offset) + ## points(data[set,], pch = 1, cex=2, col=1) + ## } + + } } -superboxplot <- function( data, ..., significance_level=0.95 ) +superboxplot <- function( data, ..., significance_level=0.95, paired=T, test_closer_to_zero=F, alg_info=T ) { b <- boxplot(data,...) #,col=rep(2:(ncol(data)+1))) - #print mean and stdev - for (i in 1:ncol(data)) - { - med <- sprintf("%.3f",b$stats[,i][3]) - stdev <- sprintf("%.3f",sqrt(var(data[,i]))) - mtext(paste(med,"+-",stdev),side=1,at=i,line=2) - } - - #print significance tests - if (nrow(data)>10) + if (alg_info) { - sig <- array(list(),c(ncol(data),ncol(data))) - for (i in 1:(ncol(data)-1)) + #print mean and stdev + for (i in 1:ncol(data)) { - for (j in (i+1):ncol(data)) - { - ttest = t.test(data[,i],data[,j],paired=T) - #print(ttest$p.value) - #print(is.na(ttest$p.value)) - if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value) - { - sig[i,j] = T - sig[j,i] = T - } - else - { - sig[i,j] = F - sig[j,i] = F - } - } + med <- sprintf("%.3f",b$stats[,i][3]) + stdev <- sprintf("%.3f",sqrt(var(data[,i]))) + mtext(paste(med,"+-",stdev),side=1,at=i,line=2) } - for (i in 1:ncol(data)) + + #print significance tests + if (nrow(data)>10 && significance_level>0 && significance_level<=1) { - ## s <- "" - ## for (j in 1:ncol(data)) - ## { - ## if (i == j) - ## s <- paste( s, "-" ,sep="") - ## else if (sig[i,j]==T) - ## s <- paste( s, "X" ,sep="") - ## else - ## s <- paste( s, "0" ,sep="") - ## } - - s<-"" - bigger <- "" - for (j in 1:ncol(data)) - { - #print(paste(i,j)) - if (i!=j && sig[i,j]==T && b$stats[,i][3] > b$stats[,j][3]) - bigger <- paste(bigger,j,sep=",") - } - if (nchar(bigger)>0) - { - bigger <- substring(bigger, 2) - bigger <- paste(">(",bigger,")",sep="") - s <- paste(s,bigger,sep=" ") - } - smaller <- "" - for (j in 1:ncol(data)) + sig <- array(list(),c(ncol(data),ncol(data))) + sig_var <- array(list(),c(ncol(data),ncol(data))) + for (i in 1:(ncol(data)-1)) { - #print(paste(i,j)) - if (i!=j && sig[i,j]==T && b$stats[,i][3] < b$stats[,j][3]) - smaller <- paste(smaller,j,sep=",") + for (j in (i+1):ncol(data)) + { + if (test_closer_to_zero==F) + ttest = t.test(data[,i],data[,j],paired=paired) + else + ttest = ttest_closer_to_zero(data[,i],data[,j],paired=paired) + if ( !is.na(ttest$p.value) && 1-significance_level > ttest$p.value) + { + sig[i,j] = T + sig[j,i] = T + } + else + { + sig[i,j] = F + sig[j,i] = F + } + + ftest = var.test(data[,i],data[,j]) + if ( !is.na(ftest$p.value) && 1-significance_level > ftest$p.value) + { + sig_var[i,j] = T + sig_var[j,i] = T + } + else + { + sig_var[i,j] = F + sig_var[j,i] = F + } + } } - if (nchar(smaller)>0) + for (i in 1:ncol(data)) { - smaller <- substring(smaller, 2) - smaller <- paste("<(",smaller,")",sep="") - s <- paste(s,smaller,sep=" ") + ## s <- "" + ## for (j in 1:ncol(data)) + ## { + ## if (i == j) + ## s <- paste( s, "-" ,sep="") + ## else if (sig[i,j]==T) + ## s <- paste( s, "X" ,sep="") + ## else + ## s <- paste( s, "0" ,sep="") + ## } + + s<-"" + bigger <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T) + { + if (test_closer_to_zero==F && b$stats[,i][3] > b$stats[,j][3]) + bigger <- paste(bigger,j,sep=",") + else if (test_closer_to_zero==T && abs(b$stats[,i][3]) > abs(b$stats[,j][3])) + bigger <- paste(bigger,j,sep=",") + } + } + if (nchar(bigger)>0) + { + s<-"med" + bigger <- substring(bigger, 2) + bigger <- paste(">(",bigger,")",sep="") + s <- paste(s,bigger,sep=" ") + } + smaller <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig[i,j]==T) + { + if (test_closer_to_zero==F && b$stats[,i][3] < b$stats[,j][3]) + smaller <- paste(smaller,j,sep=",") + else if (test_closer_to_zero==T && abs(b$stats[,i][3]) < abs(b$stats[,j][3])) + smaller <- paste(smaller,j,sep=",") + } + } + if (nchar(smaller)>0) + { + if(nchar(bigger)==0) + s<-"med" + smaller <- substring(smaller, 2) + smaller <- paste("<(",smaller,")",sep="") + s <- paste(s,smaller,sep=" ") + } + + bigger <- "" + for (j in 1:ncol(data)) + { + if (i!=j && sig_var[i,j]==T && var(data[,i]) > var(data[,j])) + bigger <- paste(bigger,j,sep=",") + } + if (nchar(bigger)>0) + { + s <- paste(s,"var",sep=" ") + bigger <- substring(bigger, 2) + bigger <- paste(">(",bigger,")",sep="") + s <- paste(s,bigger,sep=" ") + } + smaller <- "" + for (j in 1:ncol(data)) + { + #print(paste(i,j)) + if (i!=j && sig_var[i,j]==T && var(data[,i]) < var(data[,j])) + smaller <- paste(smaller,j,sep=",") + } + if (nchar(smaller)>0) + { + if(nchar(bigger)==0) + s <- paste(s,"var",sep=" ") + smaller <- substring(smaller, 2) + smaller <- paste("<(",smaller,")",sep="") + s <- paste(s,smaller,sep=" ") + } + + + mtext(s,side=1,at=i,line=3) } - mtext(s,side=1,at=i,line=3) } } #print(sig) } +ttest_closer_to_zero <- function( x, y, ... ) +{ + prep = pre_process_ttest_closer_to_zero(x,y) + t.test(prep$x, prep$y, ...) +} + +pre_process_ttest_closer_to_zero <- function( x, y ) +{ + medX = median(x) + medY = median(y) + if((medX > 0 && medY > 0)|| (medX < 0 && medY < 0)) + { + list(x=x,y=y) + } + else + { + pos_data = x + neg_data = y + if (medX<medY) + { + pos_data = y + neg_data = x + } + shift = min(abs(medX),abs(medY)) + pos_data = pos_data - shift + neg_data = neg_data + shift + if (medX>medY) + list(x=pos_data,y=neg_data) + else + list(x=neg_data,y=pos_data) + } +} + +split_plot <- function(ratio=0.33) +{ + print("splitting") + split = contra_stratified_split(data, ratio=ratio) + #print(split$cluster) + print("plotting") + plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split) +} + #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) -#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) +#b<-matrix(rnorm(5000, mean=0, sd=30), ncol=5) +#data<-b #data<-rbind(a,b) #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) #data<-rbind(data,c) #data=iris[1:4] + +#data = data[1:10,] + #require("datasets") #data=ChickWeight #data=freeny @@ -621,6 +927,10 @@ superboxplot <- function( data, ..., significance_level=0.95 ) #pca_reduce_features(data) #data=ethanol #split = stratified_k_fold_split(data, num_folds=3) + +#set.seed(as.numeric(Sys.time())) +#split = contra_stratified_split(data, ratio=0.1) +#split = stratified_split(data, ratio=0.1, method="cluster_hierarchical") #split = stratified_split(data, ratio=0.1, method="cluster_hierarchical") #split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit") @@ -631,38 +941,25 @@ superboxplot <- function( data, ..., significance_level=0.95 ) #print(split) #print(sum(split) -#plot_split(plot_pre_process(data, method="pca"),color_idx=split$split) -#plot_split(plot_pre_process(data, method="pca"),circle_idx=split$split,color_idx=split$cluster) +#print(split$cluster) +#print(which(split$cluster==1)) + +#plot_data = plot_pre_process(data, method="sammon") + +#print(nrow(plot_data)) +#print(ncol(plot_data)) +#print(head(plot_data)) +#print(plot_data[which(split$cluster==1),]) + + +#plot_data = data +#plot_split(plot_data,color_idx=split$split) +#plot_split(plot_data,circle_idx=split$split,color_idx=split$cluster) +#plot_split(plot_data,circle_idx=split$cluster,color_idx=split$split) #cl = cluster(data) -## # stratified splitting plot -## -## #png("data.png",width=1024,height=768,pointsize=20) -## png("random.png",width=1024,height=768,pointsize=20) -## #png("stratfied_cluster.png",width=1024,height=768,pointsize=20) -## #png("stratfied_cluster_selected.png",width=1024,height=768,pointsize=20) -## #png("stratfied_selected.png",width=1024,height=768,pointsize=20) -## -## #seed = sample(1:1000000,1) -## #print(paste("seed ",seed)) -## #set.seed(seed) -## -## set.seed(86281) #unlucky random -## #set.seed(160719) #nice stratifed -## -## data=df_11306 -## data=cbind(data[1],data[3]) -## plot_split(data,xlab="Molecular weight",ylab="LogP") -## -## #split = stratified_split(data, ratio=0.1,preprocess = "pca", method="cluster_hierarchical", method_2="explicit") -## split = list(split=random_split(data, ratio=0.1)) -## -## plot_split(data,color_idx=split$split,xlab="Molecular weight",ylab="LogP") -## #plot_split(data,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP") -## #plot_split(data,circle_idx=split$split,color_idx=split$cluster,xlab="Molecular weight",ylab="LogP") -## -## dev.off()
\ No newline at end of file + |