From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/feature_selection.rb | 60 +++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'lib/feature_selection.rb') diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb index 43e3bea..f599539 100644 --- a/lib/feature_selection.rb +++ b/lib/feature_selection.rb @@ -3,41 +3,39 @@ module OpenTox class FeatureSelection - def self.correlation_filter dataset:, prediction_feature:, types:nil - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? + def self.correlation_filter model relevant_features = {} - measurements = [] - substances = [] - dataset.substances.each do |s| - dataset.values(s,prediction_feature).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq - feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["pvalue"] = pvalue - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." + R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} + model.descriptor_weights = [] + selected_variables = [] + selected_descriptor_ids = [] + model.independent_variables.each_with_index do |v,i| + R.assign "independent", v.collect{|n| to_r(n)} + begin + R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 + selected_variables << v + selected_descriptor_ids << model.descriptor_ids[i] end + rescue + #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed." + warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." end end - relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + + model.independent_variables = selected_variables + model.descriptor_ids = selected_descriptor_ids + model + end + + def self.to_r v + return 0 if v == false + return 1 if v == true + return "NA" if v.nil? + return "NA" if v.is_a? Float and v.nan? + v end end -- cgit v1.2.3