summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordavor <vorgrimmlerdavid@gmx.de>2012-03-05 12:13:58 +0100
committerdavor <vorgrimmlerdavid@gmx.de>2012-03-05 12:13:58 +0100
commit88caecc7b94b96ccb5be47ec719bfcb55d4581d1 (patch)
treeacca94dcb7ffa967acf59a5a11969696db082352
parentbf1792b0353f0af5bf5f5383d193e315a8968245 (diff)
Added new ruby-plot version. Modified rfe parameter and removed invalid
features before rfe calculation.
-rw-r--r--Rakefile2
-rw-r--r--lib/algorithm.rb17
2 files changed, 14 insertions, 5 deletions
diff --git a/Rakefile b/Rakefile
index dddea1b..4d1d0c3 100644
--- a/Rakefile
+++ b/Rakefile
@@ -42,7 +42,7 @@ begin
gem.add_dependency "dm-migrations", "=1.1.0"
gem.add_dependency "dm-validations", "=1.1.0"
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.6.0"
+ gem.add_dependency "ruby-plot", "=0.6.1"
gem.add_dependency "gsl", "=1.14.7"
gem.add_dependency "statsample", "=1.1.0"
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index f57954d..86a5d14 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -443,7 +443,7 @@ module OpenTox
end
- module FeatureSelection
+ module FeatureSelection
include Algorithm
# Recursive Feature Elimination using caret
# @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
@@ -480,10 +480,14 @@ module OpenTox
na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
features = features[,!names(features) %in% na_col]
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
# features with zero variance removed
zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
features = features[,!names(features) %in% zero_var]
-
+
pp = NULL
if (del_missing) {
# needed if rows should be removed
@@ -497,15 +501,20 @@ module OpenTox
}
features = predict(pp, features)
+ # features with nan values removed (sometimes preProcess return NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
# determine subsets
subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
-
+
# Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=250), sizes=subsets)
# read existing dataset and select most useful features
csv=feats[,c("SMILES", rfProfile$optVariables)]