diff options
author | Andreas Maunz <andreas@maunz.de> | 2012-02-15 11:54:03 +0100 |
---|---|---|
committer | Andreas Maunz <andreas@maunz.de> | 2012-02-15 11:54:03 +0100 |
commit | c84536e2bbbdcd06621e5324f5b2dfa979416b44 (patch) | |
tree | 005b37163cd6f2eb953a414a09ea63e2e70fc911 | |
parent | 9cdc0f67f71159e5b5b8200b380326a7f3423493 (diff) |
Added caret feature selectionglm
-rw-r--r-- | lib/algorithm.rb | 51 |
1 files changed, 41 insertions, 10 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 0b3714b..9d1ad44 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -402,6 +402,7 @@ module OpenTox LOGGER.debug "Preparing R data ..." @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification + LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}" @r.eval <<-EOR rem = nearZeroVar(prop_matrix) if (length(rem) > 0) { @@ -413,25 +414,55 @@ module OpenTox prop_matrix = prop_matrix[,-rem,drop=F] q_prop = q_prop[,-rem,drop=F] } + n_prop_x_size = dim(prop_matrix)[1] + n_prop_y_size = dim(prop_matrix)[2] EOR + LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}" + + + # model + support vectors + LOGGER.debug "Preprocessing R data ..." + @r.eval <<-EOR + # names + prop_matrix=data.frame(prop_matrix) + q_prop=data.frame(q_prop) + names(prop_matrix) = paste("Var",seq(1:dim(prop_matrix)[2]),sep="") + names(q_prop)=names(prop_matrix) + + # preProcess + pp = preProcess(prop_matrix, method=c("scale", "center", "pca")) + prop_matrix=predict(pp, prop_matrix) + q_prop=predict(pp, q_prop) + n_prop_x_size = dim(prop_matrix)[1] + n_prop_y_size = dim(prop_matrix)[2] + EOR + LOGGER.debug "VC: #{@r.n_prop_x_size}x#{@r.n_prop_y_size}" + # model + support vectors LOGGER.debug "Creating R GLM model ..." @r.eval <<-EOR - QSAR = data.frame(prop_matrix) - q_prop = data.frame(q_prop) - names(q_prop) = names(QSAR) - model_formula = as.formula(paste("y~", paste(names(QSAR), collapse="+"), sep="")) - QSAR$y = y - #model = train ( model_formula, data=QSAR, method="glm", family=gaussian(link="log"), preProcess=c("center", "scale") ) - model = train ( model_formula, data=QSAR, method="lm", preProcess=c("center", "scale", "pca") ) - perf = model$results[which.min(model$results$RMSE),]$Rsquared + # determine subsets + subsets = dim(prop_matrix)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) + subsets = c(2,3,4,5,7,10,subsets) + subsets = unique(sort(round(subsets))) + subsets = subsets[subsets<=dim(prop_matrix)[2]] + subsets = subsets[subsets>1] + + save.image("/tmp/test.R") + # rfeControl + ctrl = rfeControl(functions=lmFuncs,verbose=F,returnResamp="final") + + # do rfe + model = rfe(prop_matrix,y,sizes=subsets,rfeControl=ctrl) + res=model$results; bs=model$bestSubset + perf = res[res$Variables==bs,"Rsquared"] EOR # prediction - LOGGER.debug "Predicting ..." - @r.eval "p = predict(model,q_prop)" + LOGGER.debug "Predicting (#{@r.bs} features)..." + @r.eval "p = predict(model$fit,q_prop)" prediction = @r.p # censoring |