summaryrefslogtreecommitdiff
path: root/lib/stratification.R
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stratification.R')
-rw-r--r--lib/stratification.R78
1 files changed, 71 insertions, 7 deletions
diff --git a/lib/stratification.R b/lib/stratification.R
index 76ff2d8..3f8698c 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,4 +1,13 @@
+round_it <- function( x )
+{
+ if(isTRUE((x - floor(x))>=0.5))
+ ceiling(x)
+ else
+ floor(x)
+}
+
+
nominal_to_binary <- function( data )
{
result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
result
}
-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
{
data.num <- as.data.frame(data)
+ if (!is.null(colnames))
+ {
+ data.num = subset(data.num, select = colnames)
+ }
if (!is.numeric(data.num))
{
data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
cbind(s$partition[,m])
}
-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
{
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
if (method == "samplecube")
{
require("sampling")
# adjust ratio to make samplecube return exact number of samples
- ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
data.strat = cbind(pik,data.processed)
samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
stop("unknown method")
}
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
{
print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
if (method == "samplecube")
{
folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
{
require("TunePareto")
cl = cluster(data.processed)
- res = generateCVRuns(cl,ntimes=1,nfold=3)
+ res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
folds = rep(0, times=nrow(data))
for (i in 1:num_folds)
for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
stop("unknown method")
}
+duplicate_indices <- function( data ) {
+ indices = 1:nrow(data)
+ z = data
+ duplicate_index = anyDuplicated(z)
+ while(duplicate_index) {
+ duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+ #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+ indices[duplicate_index] <- duplicate_to_index
+ z[duplicate_index,] <- paste('123$ยง%',duplicate_index)
+ duplicate_index = anyDuplicated(z)
+ }
+ indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+ result = data[1,]
+ for(i in 2:length(dup_indices)) {
+ row = data[rownames(data)==dup_indices[i],]
+ if(length(row)==0)
+ stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+ result = rbind(result, row)
+ }
+ rownames(result)<-NULL
+ result
+}
+
+sammon_duplicates <- function( data, ... ) {
+ di <- duplicate_indices(data)
+ print(di)
+ u <- unique(data)
+ print(paste('unique data points',nrow(u),'of',nrow(data)))
+ if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
+ points_unique <- sammon(dist(u), ...)$points
+ if (nrow(u)<nrow(data))
+ {
+ points <- add_duplicates(points_unique, di)
+ points
+ }
+ else
+ {
+ points_unique
+ }
+}
+
plot_pre_process <- function( data, method="pca" )
{
data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
data.emb$conf
}
+ else if (method == "sammon")
+ {
+ require("MASS")
+ sammon_duplicates(data.processed, k=2)
+ }
else
stop("unknown method")
}