# lib/lazar_regression.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

module OpenTox

  module Model

    # Lazar model for numeric endpoints.
    #
    # A prediction is the similarity-weighted average of the log10-transformed
    # training activities of a query compound's neighbors, transformed back to
    # the original scale.
    class LazarRegression < Lazar

      # Create a lazar model from a training dataset.
      #
      # The training dataset has to contain exactly one (prediction) feature.
      # If that feature's +nominal+ flag is set a LazarClassification model is
      # instantiated, otherwise a LazarRegression model. The model is saved
      # before it is returned.
      #
      # Support for a separate feature dataset is currently disabled.
      #
      # @param [OpenTox::Dataset] training_dataset
      # @return [OpenTox::Model::Lazar] Regression or classification model
      def self.create(training_dataset)
        feature_count = training_dataset.features.size
        if feature_count.zero?
          bad_request_error "No prediction feature found in training_dataset #{training_dataset.id}"
        elsif feature_count > 1
          bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}"
        end

        prediction_feature = training_dataset.features.first
        model_class = prediction_feature.nominal ? OpenTox::Model::LazarClassification : OpenTox::Model::LazarRegression
        lazar = model_class.new
        lazar.training_dataset_id = training_dataset.id
        lazar.prediction_feature_id = prediction_feature.id
        lazar.title = prediction_feature.title

        # TODO: ideas for improving the model
        # log transform activities (create new dataset)
        # scale, normalize features, might not be necessary
        # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
        # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
        # zero-order correlation and the semi-partial correlation
        # seems to be necessary for svm
        #   http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
        #   http://stackoverflow.com/questions/15436367/svm-scaling-input-values
        # use lasso or elastic net??
        # select relevant features
        #   remove features with a single value
        #   remove correlated features
        #   remove features not correlated with endpoint

        lazar.save
        lazar
      end

      # Predict the activity of one or more compounds.
      #
      # Each prediction is a Hash with the keys +:value+ (predicted activity or
      # nil), +:confidence+ (mean similarity of the contributing neighbors or
      # nil) and +:warning+ (String or nil).
      #
      # @param [OpenTox::Compound, Array<OpenTox::Compound>, OpenTox::Dataset] object
      #   Compound(s) to predict
      # @return [Hash, Array<Hash>, LazarPrediction] a single prediction for a
      #   compound, an array of predictions for an array, or a saved
      #   prediction dataset for a dataset
      def predict(object)
        started_at = Time.now

        compounds = query_compounds(object)
        dataset = training_dataset
        activities = dataset.data_entries
        # Look up training rows by compound id in constant time. Only the first
        # occurrence of an id is kept, which matches Array#index.
        row_index = {}
        dataset.compound_ids.each_with_index do |id, i|
          row_index[id] = i unless row_index.key?(id)
        end
        $logger.debug "Setup: #{Time.now-started_at}"

        prediction_started_at = Time.now
        predictions = compounds.collect do |compound|
          weighted_average_prediction(compound.neighbors, row_index, activities)
        end
        $logger.debug "Prediction time: #{Time.now-prediction_started_at}"
        $logger.debug "Total prediction time: #{Time.now-started_at}"

        # serialize result
        case object
        when OpenTox::Compound
          predictions.first
        when Array
          predictions
        when OpenTox::Dataset
          # prepare prediction dataset
          feature = prediction_feature
          prediction_dataset = LazarPrediction.new(
            :title => "Lazar prediction for #{feature.title}",
            :creator =>  __FILE__,
            :prediction_feature_id => feature.id
          )
          confidence_feature = OpenTox::NumericFeature.find_or_create_by("title" => "Prediction confidence")
          warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
          prediction_dataset.features = [feature, confidence_feature, warning_feature]
          prediction_dataset.compounds = compounds
          prediction_dataset.data_entries = predictions.collect do |prediction|
            [prediction[:value], prediction[:confidence], prediction[:warning]]
          end
          prediction_dataset.save_all
          prediction_dataset
        end
      end

      # Load the dataset this model was trained on.
      # @return [OpenTox::Dataset]
      def training_dataset
        Dataset.find training_dataset_id
      end

      # Load the feature (endpoint) predicted by this model.
      # @return [OpenTox::Feature]
      def prediction_feature
        Feature.find prediction_feature_id
      end

      # Collect the prediction feature's value from every training data entry.
      # @return [Array] one activity per data entry of the training dataset
      def training_activities
        # Use an already assigned @training_dataset if there is one, otherwise
        # load the dataset instead of failing on nil.
        dataset = @training_dataset || training_dataset
        i = dataset.feature_ids.index prediction_feature_id
        dataset.data_entries.collect { |de| de[i] }
      end

      private

      # Normalize the argument of #predict into an array of compounds.
      # Unsupported argument types are reported with bad_request_error
      # (presumably raising - confirm against its definition).
      def query_compounds(object)
        case object
        when OpenTox::Compound
          [object]
        when Array
          object
        when OpenTox::Dataset
          object.compounds
        else
          bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
        end
      end

      # Similarity-weighted average (in log10 space) of the neighbors' training
      # activities.
      #
      # @param [Array] neighbors rows of [neighbor, similarity]
      # @param [Hash] row_index compound id => row in +activities+
      # @param [Array<Array>] activities data entries of the training dataset;
      #   the first value of each entry is used as activity
      # @return [Hash] prediction with :value, :confidence and :warning
      def weighted_average_prediction(neighbors, row_index, activities)
        weighted_sum = 0.0
        sim_sum = 0.0
        used = 0
        skipped = 0

        neighbors.each do |neighbor, sim|
          i = row_index[neighbor.id]
          next unless i # neighbor is not part of the training dataset
          act = activities[i].first
          next unless act # no measured activity
          unless act > 0
            # log10 is undefined for negative values and -Infinity for zero
            skipped += 1
            next
          end
          weighted_sum += sim*Math.log10(act)
          sim_sum += sim
          used += 1
        end

        warnings = []
        warnings << "#{skipped} neighbor(s) with non-positive activities ignored." if skipped > 0

        value = nil
        confidence = nil
        if sim_sum.zero?
          # avoid dividing by zero when nothing contributed to the prediction
          warnings << "No similar training compounds with activities found, cannot create prediction."
        else
          value = 10**(weighted_sum/sim_sum)
          # NOTE(review): confidence is defined here as the mean similarity of
          # the contributing neighbors - confirm that this matches the
          # definition used by the classification model.
          confidence = sim_sum/used
        end

        {
          :value => value,
          :confidence => confidence,
          :warning => (warnings.empty? ? nil : warnings.join(" "))
        }
      end

    end


  end

end