diff options
author | davor <vorgrimmlerdavid@gmx.de> | 2012-03-05 12:13:58 +0100 |
---|---|---|
committer | davor <vorgrimmlerdavid@gmx.de> | 2012-03-05 12:13:58 +0100 |
commit | 88caecc7b94b96ccb5be47ec719bfcb55d4581d1 (patch) | |
tree | acca94dcb7ffa967acf59a5a11969696db082352 | |
parent | bf1792b0353f0af5bf5f5383d193e315a8968245 (diff) |
Added new ruby-plot version. Modified rfe parameter and removed invalid
features before rfe calculation.
-rw-r--r-- | Rakefile | 2 | ||||
-rw-r--r-- | lib/algorithm.rb | 17 |
2 files changed, 14 insertions, 5 deletions
@@ -42,7 +42,7 @@ begin gem.add_dependency "dm-migrations", "=1.1.0" gem.add_dependency "dm-validations", "=1.1.0" gem.add_dependency "dm-sqlite-adapter", "=1.1.0" - gem.add_dependency "ruby-plot", "=0.6.0" + gem.add_dependency "ruby-plot", "=0.6.1" gem.add_dependency "gsl", "=1.14.7" gem.add_dependency "statsample", "=1.1.0" diff --git a/lib/algorithm.rb b/lib/algorithm.rb index f57954d..86a5d14 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -443,7 +443,7 @@ module OpenTox end - module FeatureSelection + module FeatureSelection include Algorithm # Recursive Feature Elimination using caret # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values). @@ -480,10 +480,14 @@ module OpenTox na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) ) features = features[,!names(features) %in% na_col] + # features with infinite values removed + inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) ) + features = features[,!names(features) %in% inf_col] + # features with zero variance removed zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) ) features = features[,!names(features) %in% zero_var] - + pp = NULL if (del_missing) { # needed if rows should be removed @@ -497,15 +501,20 @@ module OpenTox } features = predict(pp, features) + # features with nan values removed (sometimes preProcess return NaN values) + nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) ) + features = features[,!names(features) %in% nan_col] + # determine subsets subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) subsets = c(2,3,4,5,7,10,subsets) + #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30) subsets = unique(sort(round(subsets))) subsets = subsets[subsets<=dim(features)[2]] subsets = 
subsets[subsets>1] - + # Recursive feature elimination - rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets) + rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=250), sizes=subsets) # read existing dataset and select most useful features csv=feats[,c("SMILES", rfProfile$optVariables)] |