summaryrefslogtreecommitdiff
path: root/lib/feature_selection.rb
blob: 43e3bea8aa00e706bfda94827fd30d6d3ad0314e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
module OpenTox
  module Algorithm

    # Feature selection algorithms
    class FeatureSelection

      # Select the substance properties that correlate significantly
      # (Pearson test evaluated in R, p-value <= 0.05) with the measured
      # values of the prediction feature.
      #
      # @param dataset must respond to +substances+ and to
      #   +values(substance, feature)+; each substance must expose a
      #   "properties" hash mapping feature ids to arrays of values
      # @param prediction_feature feature handed to +dataset.values+ to obtain
      #   the measurements (NOTE(review): may be a feature object or an id,
      #   this file does not show which one callers pass)
      # @param types optional collection of feature categories; when given,
      #   only features whose +Feature.find(id).category+ is included are tested
      # @return [Hash] feature id => {"pvalue", "r", "mean", "sd"}, ordered by
      #   ascending p-value
      def self.correlation_filter dataset:, prediction_feature:, types:nil
        # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
        relevant_features = {}
        measurements = []
        substances = []
        # One entry per measurement: a substance with several measured values
        # is repeated so that both lists stay aligned by index.
        dataset.substances.each do |s|
          dataset.values(s,prediction_feature).each do |act|
            measurements << act
            substances << s
          end
        end
        R.assign "tox", measurements
        feature_ids = dataset.substances.flat_map{ |s| s["properties"].keys }.uniq
        feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types
        feature_ids.each do |feature_id|
          # nil marks a substance without a value for this feature
          feature_values = substances.collect do |s|
            values = s["properties"][feature_id]
            values.first if values
          end
          # A column holding a single distinct value cannot be correlated
          next if feature_values.uniq.size == 1
          R.assign "feature", feature_values
          begin
            R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
            pvalue = R.eval("cor$p.value").to_ruby
            if pvalue <= 0.05
              relevant_features[feature_id] = {
                "pvalue" => pvalue,
                "r" => R.eval("cor$estimate").to_ruby,
                "mean" => R.eval("mean(feature, na.rm=TRUE)").to_ruby,
                "sd" => R.eval("sd(feature, na.rm=TRUE)").to_ruby
              }
            end
          rescue
            # Best effort: a failing correlation only skips this feature.
            # The message must not raise itself, otherwise the whole
            # selection would be aborted by the diagnostic.
            warn "Correlation of '#{feature_label(feature_id)}' (#{feature_values}) with '#{feature_label(prediction_feature)}' (#{measurements}) failed."
          end
        end
        relevant_features.sort_by{ |_id, stats| stats["pvalue"] }.to_h
      end

      # Human readable name of a feature, for diagnostics only.
      # Accepts a feature object (anything responding to +name+) or a feature
      # id that is looked up with +Feature.find+. Never raises; falls back to
      # the string form of the argument when the name cannot be determined.
      def self.feature_label feature
        return feature.name if feature.respond_to? :name
        found = Feature.find(feature)
        found ? found.name : feature.to_s
      rescue
        feature.to_s
      end
      private_class_method :feature_label

    end

  end
end