Added caret feature selection [glm]

author: Andreas Maunz <andreas@maunz.de> 2012-02-15 11:54:03 +0100
committer: Andreas Maunz <andreas@maunz.de> 2012-02-15 11:54:03 +0100
commit: c84536e2bbbdcd06621e5324f5b2dfa979416b44 (patch)
tree: 005b37163cd6f2eb953a414a09ea63e2e70fc911
parent: 9cdc0f67f71159e5b5b8200b380326a7f3423493 (diff)
1 files changed, 41 insertions, 10 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 0b3714b..9d1ad44 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -402,6 +402,7 @@ module OpenTox
             LOGGER.debug "Preparing R data ..."
             @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
 
+            LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}"
             @r.eval <<-EOR
               rem = nearZeroVar(prop_matrix)
               if (length(rem) > 0) {
@@ -413,25 +414,55 @@ module OpenTox
                 prop_matrix = prop_matrix[,-rem,drop=F]
                 q_prop = q_prop[,-rem,drop=F]
               }
+              n_prop_x_size = dim(prop_matrix)[1]
+              n_prop_y_size = dim(prop_matrix)[2]
             EOR
+            LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}"
+
+
+            # model + support vectors
+            LOGGER.debug "Preprocessing R data ..."
+            @r.eval <<-EOR
+              # names
+              prop_matrix=data.frame(prop_matrix)
+              q_prop=data.frame(q_prop)
+              names(prop_matrix) = paste("Var",seq(1:dim(prop_matrix)[2]),sep="")
+              names(q_prop)=names(prop_matrix)
+ 
+              # preProcess
+              pp = preProcess(prop_matrix, method=c("scale", "center", "pca"))
+              prop_matrix=predict(pp, prop_matrix)
+              q_prop=predict(pp, q_prop)
+              n_prop_x_size = dim(prop_matrix)[1]
+              n_prop_y_size = dim(prop_matrix)[2]
+             EOR
+            LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}"
+
 
             # model + support vectors
             LOGGER.debug "Creating R GLM model ..."
             @r.eval <<-EOR
-              QSAR = data.frame(prop_matrix)
-              q_prop = data.frame(q_prop)
-              names(q_prop) = names(QSAR)
-              model_formula = as.formula(paste("y~", paste(names(QSAR), collapse="+"), sep=""))
-              QSAR$y = y
-              #model = train ( model_formula, data=QSAR, method="glm", family=gaussian(link="log"), preProcess=c("center", "scale") )
-              model = train ( model_formula, data=QSAR, method="lm", preProcess=c("center", "scale", "pca") )
-              perf = model$results[which.min(model$results$RMSE),]$Rsquared
+              # determine subsets
+              subsets = dim(prop_matrix)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+              subsets = c(2,3,4,5,7,10,subsets)
+              subsets = unique(sort(round(subsets))) 
+              subsets = subsets[subsets<=dim(prop_matrix)[2]]
+              subsets = subsets[subsets>1] 
+
+              save.image("/tmp/test.R")
+              # rfeControl
+              ctrl = rfeControl(functions=lmFuncs,verbose=F,returnResamp="final")
+ 
+              # do rfe
+              model = rfe(prop_matrix,y,sizes=subsets,rfeControl=ctrl) 
+              res=model$results; bs=model$bestSubset
+              perf = res[res$Variables==bs,"Rsquared"]
             EOR
 
 
             # prediction
-            LOGGER.debug "Predicting ..."
-            @r.eval "p = predict(model,q_prop)"
+            LOGGER.debug "Predicting (#{@r.bs} features)..."
+            @r.eval "p = predict(model$fit,q_prop)"
             prediction = @r.p
 
             # censoring
author	Andreas Maunz <andreas@maunz.de>	2012-02-15 11:54:03 +0100
committer	Andreas Maunz <andreas@maunz.de>	2012-02-15 11:54:03 +0100
commit	c84536e2bbbdcd06621e5324f5b2dfa979416b44 (patch)
tree	005b37163cd6f2eb953a414a09ea63e2e70fc911
parent	9cdc0f67f71159e5b5b8200b380326a7f3423493 (diff)