summaryrefslogtreecommitdiff
path: root/lib/ot_predictions.rb
blob: f812854959153d97d19fc18ae617afbcec01fdc5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246

require "lib/predictions.rb"

module Lib
  
  class OTPredictions < Predictions
  
    # Switches on the additional dataset consistency checks performed in
    # #initialize. The value is the match position (truthy) when the RACK_ENV
    # environment variable contains "debug" or "test", and nil otherwise.
    CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/
  
    # Identifier of the instance at +instance_index+. For this class the
    # identifier is whatever #compound returns for that index.
    def identifier(instance_index)
      compound(instance_index)
    end
  
    # Entry of @compounds (filled from the test dataset in #initialize)
    # stored at position +instance_index+.
    def compound(instance_index)
      @compounds[instance_index]
    end
  
    # Loads actual values, predicted values and confidences from OpenTox
    # datasets and passes them on to the parent constructor.
    #
    # feature_type::            "classification" or "regression" (compared as
    #                           strings; any other value loads no values)
    # test_dataset_uri::        dataset whose compounds define the instances
    # test_target_dataset_uri:: dataset holding the actual values; nil, blank
    #                           or equal to test_dataset_uri means the test
    #                           dataset itself is used
    # prediction_feature::      feature holding the actual values (required)
    # prediction_dataset_uri::  dataset holding the predictions
    # predicted_variable::      feature holding the predictions; defaults to
    #                           prediction_feature when nil
    # subjectid::               handed unchanged to every OpenTox lookup
    # task::                    optional object; receives progress(40),
    #                           progress(80) and progress(100)
    #
    # Raises RuntimeError when a dataset cannot be found, prediction_feature
    # is missing or not part of the target dataset, the test dataset is
    # empty, the prediction dataset has more compounds than the test dataset,
    # or the predicted variable is absent and some compound does not have
    # exactly one data entry.
    def initialize(feature_type, test_dataset_uri, test_target_dataset_uri,
      prediction_feature, prediction_dataset_uri, predicted_variable, subjectid=nil, task=nil)

      LOGGER.debug("loading prediciton via test-dataset:'"+test_dataset_uri.to_s+
        "', test-target-datset:'"+test_target_dataset_uri.to_s+
        "', prediction-dataset:'"+prediction_dataset_uri.to_s+
        "', prediction_feature: '"+prediction_feature.to_s+"' "+
        "', predicted_variable: '"+predicted_variable.to_s+"'")

      predicted_variable=prediction_feature if predicted_variable==nil

      test_dataset = OpenTox::Dataset.find test_dataset_uri,subjectid
      raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset
      raise "prediction_feature missing" unless prediction_feature

      if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri
        # no separate target dataset: actual values come from the test dataset
        test_target_dataset_uri = test_dataset_uri
        test_target_dataset = test_dataset
        raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+
              "prediction_feature: '"+prediction_feature.to_s+"'\n"+
              "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
              "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
      else
        test_target_dataset = OpenTox::Dataset.find test_target_dataset_uri,subjectid
        raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset
        if CHECK_VALUES
          # every test compound has to be present in the target dataset
          test_dataset.compounds.each do |c|
            raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c)
          end
        end
        raise "prediction_feature not found in test_target_dataset\n"+
              "prediction_feature: '"+prediction_feature.to_s+"'\n"+
              "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
              "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
      end

      @compounds = test_dataset.compounds
      LOGGER.debug "test dataset size: "+@compounds.size.to_s
      raise "test dataset is empty "+test_dataset_uri.to_s unless @compounds.size>0
      # the class domain is only looked up for classification
      class_values = feature_type=="classification" ? OpenTox::Feature.find(prediction_feature, subjectid).domain : nil

      actual_values = []
      @compounds.each do |c|
        case feature_type
        when "classification"
          actual_values << classification_value(test_target_dataset, c, prediction_feature, class_values)
        when "regression"
          actual_values << regression_value(test_target_dataset, c, prediction_feature)
        end
      end
      task.progress(40) if task # loaded actual values

      prediction_dataset = OpenTox::Dataset.find prediction_dataset_uri,subjectid
      raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset

      # TODO: remove LAZAR_PREDICTION_DATASET_HACK
      # If the predicted variable is not a feature of the prediction dataset,
      # the single data entry of each compound is used instead; this is only
      # tolerated when no compound has a different number of entries.
      no_prediction_feature = prediction_dataset.features.keys.index(predicted_variable)==nil
      if no_prediction_feature
        one_entry_per_compound = @compounds.all? do |c|
          entries = prediction_dataset.data_entries[c]
          !entries || entries.size == 1
        end
        # .to_s keeps this diagnostic from failing with a TypeError when the
        # predicted variable is not a String
        msg = "prediction-feature not found: '"+predicted_variable.to_s+"' in prediction-dataset: "+prediction_dataset_uri.to_s+", available features: "+
          prediction_dataset.features.keys.inspect
        if one_entry_per_compound
          LOGGER.warn msg
        else
          raise msg
        end
      end

      # read once; the list is used for the size check and for every lookup below
      predicted_compounds = prediction_dataset.compounds
      raise "more predicted than test compounds test:"+@compounds.size.to_s+" < prediction:"+
        predicted_compounds.size.to_s if @compounds.size < predicted_compounds.size
      if CHECK_VALUES
        predicted_compounds.each do |c|
          raise "predicted compound not found in test dataset:\n"+c.to_s+"\ntest-compounds:\n"+
            @compounds.collect{|test_compound| test_compound.to_s}.join("\n") if @compounds.index(c)==nil
        end
      end

      predicted_values = []
      confidence_values = []
      @compounds.each do |c|
        unless predicted_compounds.include?(c)
          # compound was not predicted: keep the arrays aligned with @compounds
          predicted_values << nil
          confidence_values << nil
          next
        end

        case feature_type
        when "classification"
          # TODO: remove LAZAR_PREDICTION_DATASET_HACK
          predicted_values << classification_value(prediction_dataset, c, no_prediction_feature ? nil : predicted_variable, class_values)
        when "regression"
          predicted_values << regression_value(prediction_dataset, c, no_prediction_feature ? nil : predicted_variable)
        end
        # TODO confidence_values << prediction_dataset.get_prediction_confidence(c, predicted_variable)
        # Best effort: the confidence is read from the metadata of the first
        # feature of the compound's data entry; 1 is used when that fails.
        conf = 1
        begin
          feature = prediction_dataset.data_entries[c].keys[0]
          feature_data = prediction_dataset.features[feature]
          conf = feature_data[OT.confidence] if feature_data[OT.confidence]!=nil
        rescue
          LOGGER.warn "could not get confidence"
        end
        confidence_values << conf
      end
      task.progress(80) if task # loaded predicted values and confidence

      super(predicted_values, actual_values, confidence_values, feature_type, class_values)
      raise "illegal num compounds "+num_info if  @compounds.size != @predicted_values.size
      task.progress(100) if task # done with the mathematics
    end
    
    private
    # Numeric value of +feature+ for +compound+ in +dataset+, or nil.
    #
    # Numeric values are returned unchanged. Everything else is converted with
    # Kernel#Float, which raises for non-numeric input. String#to_f (used
    # previously) never raises and turns such input into 0.0, so the warning
    # below could not be reached for strings and bad data ended up as 0.0.
    #
    # Returns nil when no value is available or when the value is not numeric;
    # the latter is logged as a warning.
    def regression_value(dataset, compound, feature)
      v = value(dataset, compound, feature)
      return v if v.nil? || v.is_a?(Numeric)
      begin
        Float(v)
      rescue ArgumentError, TypeError
        LOGGER.warn "no numeric value for regression: '"+v.to_s+"'"
        nil
      end
    end
    
    # Position of the value of +feature+ for +compound+ within +class_values+.
    #
    # Returns nil when the dataset holds no value for the compound.
    # Raises RuntimeError when a value exists but is not one of +class_values+.
    def classification_value(dataset, compound, feature, class_values)
      observed = value(dataset, compound, feature)
      position = class_values.index(observed)
      if position.nil? && !observed.nil?
        raise "illegal class_value of prediction (value is '#{observed}', class is '#{observed.class}'), possible values are #{class_values.inspect}"
      end
      position
    end
    
    # Single value stored in +dataset+ for +compound+ and +feature+.
    #
    # dataset::  object whose data_entries maps compound => { feature => Array }
    # compound:: key into data_entries
    # feature::  key into the compound's entry; nil selects the first entry
    #
    # Returns nil when there is no entry, the array is empty, the value's
    # string form is empty, or the array holds several distinct values (which
    # is logged as a warning). Duplicated equal values collapse to one.
    # Raises RuntimeError when the stored value is not an Array or when the
    # selected element is itself an Array.
    def value(dataset, compound, feature)
      entries = dataset.data_entries[compound]
      return nil if entries==nil
      v = feature==nil ? entries.values[0] : entries[feature]
      return nil if v==nil
      raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array)
      # uniq instead of uniq!: the array belongs to the dataset and must not
      # be modified by a read access
      distinct = v.uniq
      if distinct.size>1
        # log before giving up, so the conflicting values show up in the message
        LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+distinct.inspect
        return nil
      end
      v = distinct[0] # nil for an empty array
      raise "array" if v.is_a?(Array)
      v = nil if v.to_s.size==0
      v
    end

    public
    # Collects the statistics that apply to @feature_type.
    #
    # Each name in Lib::VAL_CLASS_PROPS (classification) or
    # Lib::VAL_REGR_PROPS (regression) is invoked as a method on this object
    # and its result stored under that name. Any other feature type yields an
    # empty hash.
    def compute_stats
      properties =
        case @feature_type
        when "classification" then Lib::VAL_CLASS_PROPS
        when "regression" then Lib::VAL_REGR_PROPS
        else []
        end

      stats = {}
      properties.each { |property| stats[property] = send(property) }
      stats
    end
    
    # Table representation of this single prediction object; delegates to
    # OTPredictions.to_array with its default options.
    def to_array
      OTPredictions.to_array([self])
    end
    
    # Builds a table (array of rows) from several prediction objects.
    #
    # predictions:: list of objects answering num_instances, identifier,
    #               actual_value, predicted_value, feature_type,
    #               classification_miss?, confidence_values_available? and
    #               confidence_value
    # add_pic::     when true, each row starts with the compound identifier
    #               followed by "/image" and the header starts with "compound"
    # format::      when true, values are rendered with to_nice_s
    #
    # Returns the header row followed by one row per instance. The optional
    # header columns are decided by the first prediction object; an empty
    # list yields just the fixed header columns.
    #
    # The image column used to be added to every row while the header gained
    # its "compound" column only for add_pic, leaving rows and header
    # misaligned by default; both now follow add_pic.
    def self.to_array( predictions, add_pic=false, format=false )

      res = []
      predictions.each do |p|
        p.num_instances.times do |i|
          a = []
          a << p.identifier(i)+"/image" if add_pic

          a << (format ? p.actual_value(i).to_nice_s : p.actual_value(i))
          a << (format ? p.predicted_value(i).to_nice_s : p.predicted_value(i))
          if p.feature_type=="classification"
            # a miss can only be judged when both values are known
            if (p.predicted_value(i)!=nil and p.actual_value(i)!=nil)
              a << (p.classification_miss?(i) ? 1 : 0)
            else
              a << nil
            end
          end
          if p.confidence_values_available?
            a << (format ? p.confidence_value(i).to_nice_s : p.confidence_value(i))
          end
          a << p.identifier(i)
          res << a
        end
      end

      first = predictions[0] # nil for an empty list
      header = []
      header << "compound" if add_pic
      header << "actual value"
      header << "predicted value"
      header << "missclassified" if first && first.feature_type=="classification"
      header << "confidence value" if first && first.confidence_values_available?
      header << "compound-uri"
      res.insert(0, header)

      res
    end
    
  end
end