summary | refs | log | tree | commit | diff
path: root/lib/feature_selection.rb
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch> 2016-10-12 21:32:27 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-10-12 21:32:27 +0200
commit: dc4ab1f4e64d738d6c0b70f0b690a2359685080f (patch)
tree: 054ae887bf978b519a95dce5dbead59bbc67a2bb /lib/feature_selection.rb
parent: 1ec5ad2c67f270287499980a794e51bc9a6bbd84 (diff)
physchem regression, correlation_filter for fingerprints
Diffstat (limited to 'lib/feature_selection.rb')
-rw-r--r-- lib/feature_selection.rb 60
1 files changed, 29 insertions, 31 deletions
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
index 43e3bea..f599539 100644
--- a/lib/feature_selection.rb
+++ b/lib/feature_selection.rb
@@ -3,41 +3,39 @@ module OpenTox
class FeatureSelection
- def self.correlation_filter dataset:, prediction_feature:, types:nil
- # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
+ def self.correlation_filter model
relevant_features = {}
- measurements = []
- substances = []
- dataset.substances.each do |s|
- dataset.values(s,prediction_feature).each do |act|
- measurements << act
- substances << s
- end
- end
- R.assign "tox", measurements
- feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq
- feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types
- feature_ids.each do |feature_id|
- feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]}
- unless feature_values.uniq.size == 1
- R.assign "feature", feature_values
- begin
- R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
- pvalue = R.eval("cor$p.value").to_ruby
- if pvalue <= 0.05
- r = R.eval("cor$estimate").to_ruby
- relevant_features[feature_id] = {}
- relevant_features[feature_id]["pvalue"] = pvalue
- relevant_features[feature_id]["r"] = r
- relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
- relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
- end
- rescue
- warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
+ model.descriptor_weights = []
+ selected_variables = []
+ selected_descriptor_ids = []
+ model.independent_variables.each_with_index do |v,i|
+ R.assign "independent", v.collect{|n| to_r(n)}
+ begin
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
+ selected_variables << v
+ selected_descriptor_ids << model.descriptor_ids[i]
end
+ rescue
+ #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed."
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
end
end
- relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+
+ model.independent_variables = selected_variables
+ model.descriptor_ids = selected_descriptor_ids
+ model
+ end
+
+ def self.to_r v
+ return 0 if v == false
+ return 1 if v == true
+ return "NA" if v.nil?
+ return "NA" if v.is_a? Float and v.nan?
+ v
end
end