summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordv <dv@dv.de>2011-07-19 14:49:34 +0200
committerdv <dv@dv.de>2011-07-19 14:49:34 +0200
commitb52a34f062fc4ad5cacf403e88861b24c3117f91 (patch)
tree6e364bbfb8856fd0cb1cc60072292c1285070660
parent733fe6dddbd427589b91eccace7a13d75c8c761a (diff)
merged with dev and removed comments
-rw-r--r--lib/algorithm.rb491
1 files changed, 353 insertions, 138 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index a50d568..43845fb 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -3,6 +3,7 @@
# avoids compiling R with X
R = nil
require "rinruby"
+require "statsample"
module OpenTox
@@ -80,18 +81,6 @@ module OpenTox
next
end
- # AM: take log if appropriate
- take_logs=true
- entry.each do |feature,values|
- values.each do |value|
- if @prediction_feature.feature_type == "regression"
- if (! value.nil?) && (value.to_f <= 0)
- take_logs=false
- end
- end
- end
- end
-
value_map=params[:value_map] unless params[:value_map].nil?
entry.each do |feature,values|
if feature == @prediction_feature.uri
@@ -103,7 +92,7 @@ module OpenTox
activity= value_map.invert[value].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
- activity= take_logs ? Math.log10(value.to_f) : value.to_f
+ activity= value.to_f
end
begin
fminer_instance.AddCompound(smiles,id)
@@ -210,78 +199,82 @@ module OpenTox
# Local multi-linear regression (MLR) prediction from neighbors.
# Uses propositionalized setting.
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
# @return [Numeric] A prediction value.
- def self.local_mlr_prop(neighbors, params, props)
-
- take_logs=true
-
- neighbors.each do |n|
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
- take_logs = false
- end
- end
-
- acts = neighbors.collect do |n|
- act = n[:activity]
- take_logs ? Math.log10(act.to_f) : act.to_f
- end # activities of neighbors for supervised learning
-
+ def self.local_mlr_prop(params)
+ raise "No neighbors found." unless params[:neighbors].size>0
begin
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
+ sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
+
LOGGER.debug "Local MLR (Propositionalization / GSL)."
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
- q_prop = props[1] # is an Array.
- n_prop_x_size = n_prop[0].size
- n_prop_y_size = n_prop.size
+ prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
+ transformer = eval "OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})"
+ prediction = transformer.values[0]
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ conf = sims.inject{|sum,x| sum + x }
+ confidence = conf/params[:neighbors].size if params[:neighbors].size > 0
+ {:prediction => prediction, :confidence => confidence}
- n_prop.flatten!
- y_x_rel = n_prop_y_size.to_f / n_prop_x_size
- repeat_factor = (1/y_x_rel).ceil
- n_prop_tmp = Array.new ; repeat_factor.times { n_prop_tmp.concat n_prop } ; n_prop = n_prop_tmp
- acts_tmp = Array.new ; repeat_factor.times { acts_tmp.concat acts } ; acts = acts_tmp
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ end
- if n_prop.size == 0
- raise "No neighbors found."
- else
- begin
- LOGGER.debug "Setting GSL data ..."
- # set data
- prop_matrix = GSL::Matrix[n_prop, n_prop_y_size * repeat_factor, n_prop_x_size]
- y = GSL::Vector[acts]
- q_prop = GSL::Vector[q_prop]
+ end
- # model + support vectors
- LOGGER.debug "Creating MLR model ..."
- work = GSL::MultiFit::Workspace.alloc(n_prop_y_size * repeat_factor, n_prop_x_size)
- c, cov, chisq, status = GSL::MultiFit::linear(prop_matrix, y, work)
- LOGGER.debug "Predicting ..."
- prediction = GSL::MultiFit::linear_est(q_prop, c, cov)[0]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
- end
- end
+ def self.mlr(params)
- prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f)
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ # GSL matrix operations:
+ # to_a : row-wise conversion to nested array
+ #
+ # Statsample operations (build on GSL):
+ # to_scale: convert into Statsample format
+
+ begin
+ n_prop = params[:n_prop].collect { |v| v }
+ q_prop = params[:q_prop].collect { |v| v }
+ n_prop << q_prop # attach q_prop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # Principal Components Analysis
+ LOGGER.debug "PCA..."
+ pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
+ data_matrix = pca.data_transformed_matrix
+
+ # Attach intercept column to data
+ intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
+ data_matrix = data_matrix.horzcat(intercept)
+ (0..data_matrix.size2-2).each { |i|
+ autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
+ data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
+ }
+
+ # Detach query instance
+ n_prop = data_matrix.to_a
+ q_prop = n_prop.pop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # model + support vectors
+ LOGGER.debug "Creating MLR model ..."
+ c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
+ GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
end
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- conf = sims.inject{|sum,x| sum + x }
- confidence = conf/neighbors.size if neighbors.size > 0
- {:prediction => prediction, :confidence => confidence}
- end
+ end
# Classification with majority vote from neighbors weighted by similarity
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity`
- # @param [optional] params Ignored (only for compatibility with local_svm_regression)
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.weighted_majority_vote(neighbors,params={}, props=nil)
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Hash] Hash with keys `:prediction, :confidence`.
+ def self.weighted_majority_vote(params)
+
neighbor_contribution = 0.0
confidence_sum = 0.0
confidence = 0.0
@@ -289,7 +282,7 @@ module OpenTox
positive_map_value= nil
negative_map_value= nil
- neighbors.each do |neighbor|
+ params[:neighbors].each do |neighbor|
neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
@@ -307,89 +300,71 @@ module OpenTox
if params[:value_map].size == 2
if confidence_sum >= 0.0
- prediction = 2 unless neighbors.size==0
+ prediction = 2 unless params[:neighbors].size==0
elsif confidence_sum < 0.0
- prediction = 1 unless neighbors.size==0
+ prediction = 1 unless params[:neighbors].size==0
end
else
- prediction = (neighbor_contribution/confidence_sum).round unless neighbors.size==0 # AM: new multinomial prediction
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
end
- confidence = confidence_sum/neighbors.size if neighbors.size > 0
+ confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
return {:prediction => prediction, :confidence => confidence.abs}
end
# Local support vector regression from neighbors
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.local_svm_regression(neighbors, params, props=nil)
- take_logs=true
- neighbors.each do |n|
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
- take_logs = false
- end
- end
- acts = neighbors.collect do |n|
- act = n[:activity]
- take_logs ? Math.log10(act.to_f) : act.to_f
- end # activities of neighbors for supervised learning
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Hash] Hash with keys `:prediction, :confidence`.
+ def self.local_svm_regression(params)
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ raise "No neighbors found." unless params[:neighbors].size>0
begin
- prediction = (props.nil? ? local_svm(neighbors, acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr", params))
- prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f)
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect{ |n| n[:activity].to_f }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
+ prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
+ transformer = eval "OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})"
+ prediction = transformer.values[0]
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
- end
-
- begin
sim_median = Algorithm.median(sims)
- #confidence = nil
- if sim_median.nil?
+ if sim_median.nil?
+ confidence = nil
LOGGER.debug "dv ------------ sim_median is nil"
else
- #@r_sd = RinRuby.new(false,false)
- #@r_sd.r_regression_acts = acts
- #standard_deviation = @r_sd.pull "as.numeric(sd(r_regression_acts))"#calculate standard deviation
- #@r_sd.quit #free R
standard_deviation = acts.std_dev
- LOGGER.debug "dv ------------ sd: #{standard_deviation}"
confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
if confidence.nan?
confidence = nil
end
end
LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+ return {:prediction => prediction, :confidence => confidence}
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- return {:prediction => prediction, :confidence => confidence}
+
end
# Local support vector classification from neighbors
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.local_svm_classification(neighbors, params, props=nil)
- acts = neighbors.collect do |n|
- act = n[:activity]
- end # activities of neighbors for supervised learning
-# acts_f = acts.collect {|v| v == true ? 1.0 : 0.0}
- acts_f = acts
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Hash] Hash with keys `:prediction, :confidence`.
+ def self.local_svm_classification(params)
+
+ raise "No neighbors found." unless params[:neighbors].size>0
begin
- prediction = (props.nil? ? local_svm(neighbors, acts_f, sims, "C-bsvc", params) : local_svm_prop(props, acts_f, "C-bsvc", params))
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity] }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ conf = sims.inject{|sum,x| sum + x }
+ confidence = conf/params[:neighbors].size if params[:neighbors].size > 0
+ {:prediction => prediction, :confidence => confidence}
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
-
- conf = sims.inject{|sum,x| sum + x }
- confidence = conf/neighbors.size if neighbors.size > 0
- {:prediction => prediction, :confidence => confidence}
end
@@ -397,16 +372,14 @@ module OpenTox
# Local support vector prediction from neighbors.
# Uses pre-defined Kernel Matrix.
# Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
# @param [Array] acts, activities for neighbors.
# @param [Array] sims, similarities for neighbors.
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
# @return [Numeric] A prediction value.
- def self.local_svm(neighbors, acts, sims, type, params)
+ def self.local_svm(acts, sims, type, params)
LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
- neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches
+ neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
if neighbor_matches.size == 0
raise "No neighbors found."
@@ -461,7 +434,8 @@ module OpenTox
end
@r.quit # free R
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
end
@@ -471,13 +445,11 @@ module OpenTox
# Local support vector prediction from neighbors.
# Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Array] acts, activities for neighbors.
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Array] acts, activities for neighbors.
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
# @return [Numeric] A prediction value.
- def self.local_svm_prop(props, acts, type, params)
+ def self.local_svm_prop(props, acts, type)
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
@@ -523,12 +495,57 @@ module OpenTox
end
@r.quit # free R
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
end
prediction
end
+ # Get X and Y size of a nested Array (Matrix)
+ def self.get_sizes(matrix)
+ begin
+ nr_cases = matrix.size
+ nr_features = matrix[0].size
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
+ [ nr_cases, nr_features ]
+ end
+
+ # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
+ # Same for the vector describing the query compound
+ # @param[Array] neighbors.
+ # @param[OpenTox::Compound] query compound.
+ # @param[Array] Dataset Features.
+ # @param[Array] Fingerprints of neighbors.
+ # @param[Float] p-values of Features.
+ def self.get_props (params)
+ matrix = Array.new
+ begin
+ params[:neighbors].each do |n|
+ n = n[:compound]
+ row = []
+ params[:features].each do |f|
+ if ! params[:fingerprints][n].nil?
+ row << (params[:fingerprints][n].include?(f) ? params[:p_values][f] : 0.0)
+ else
+ row << 0.0
+ end
+ end
+ matrix << row
+ end
+ row = []
+ params[:features].each do |f|
+ row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
+ end
+ rescue Exception => e
+ LOGGER.debug "get_props failed with '" + $! + "'"
+ end
+ [ matrix, row ]
+ end
end
@@ -549,6 +566,195 @@ module OpenTox
def features(dataset_uri,compound_uri)
end
end
+
+ module Transform
+ include Algorithm
+
+ # The transformer that inverts values.
+ # 1/x is used, after values have been moved >= 1.
+ class Inverter
+ attr_accessor :offset, :values
+
+ # @param [Array] Values to transform.
+ # @param [Float] Offset for restore.
+ def initialize *args
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if @values.size==0
+ @values = values.collect { |v| -1.0 * v }
+ @offset = 1.0 - @values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values.collect! { |v| v - @offset } # slide >1
+ @values.collect! { |v| 1 / v } # invert to [0,1]
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 1 / v }
+ @values.collect! { |v| v + @offset }
+ @values.collect! { |v| -1.0 * v }
+ end
+ end
+ end
+
+ # The transformer that takes logs.
+ # Log10 is used, after values have been moved > 0.
+ class Log10
+ attr_accessor :offset, :values
+
+ # @param [Array] Values to transform / restore.
+ # @param [Float] Offset for restore.
+ def initialize *args
+ @distance_to_zero = 0.000000001 # 1 / 1 billion
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if values.size==0
+ @offset = values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values = values.collect { |v| v - @offset } # slide > anchor
+ @values.collect! { |v| v + @distance_to_zero } #
+ @values.collect! { |v| Math::log10 v } # log10 (can fail)
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 10**v }
+ @values.collect! { |v| v - @distance_to_zero }
+ @values.collect! { |v| v + @offset }
+ end
+ end
+ end
+
+ # The transformer that does nothing (No OPeration).
+ class NOP
+ attr_accessor :offset, :values
+
+ # @param [Array] Values to transform / restore.
+ # @param [Float] Offset for restore.
+ def initialize *args
+ @offset = 0.0
+ @distance_to_zero = 0.0
+ case args.size
+ when 1
+ @values = args[0]
+ when 2
+ @values = args[0]
+ end
+ end
+ end
+
+
+ # Auto-Scaler for Arrays
+ # Center on mean and divide by standard deviation
+ class AutoScale
+ attr_accessor :scaled_values, :mean, :stdev
+
+ # @param [Array] Values to transform.
+ def initialize values
+ @scaled_values = values
+ @mean = @scaled_values.to_scale.mean
+ @stdev = @scaled_values.to_scale.standard_deviation_sample
+ @scaled_values = @scaled_values.collect {|vi| vi - @mean }
+ @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
+ end
+ end
+
+ # Principal Components Analysis
+ # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1].
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix
+ @compression = compression.to_f
+ @stdev = Array.new
+ @mean = Array.new
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::isnull_or_singular?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # Scaling of Axes
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
+ (0..@data_matrix_selected.size2-1).each { |i|
+ @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
+ @stdev << @autoscaler.stdev
+ @mean << @autoscaler.mean
+ }
+
+ data_matrix_hash = Hash.new
+ (0..@data_matrix_scaled.size2-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+ pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..dataset_hash.fields.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
+ dataset_matrix = dataset_hash.to_gsl.transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..data_matrix_restored.size2-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
+
+ end
# Gauss kernel
# @return [Float]
@@ -556,6 +762,16 @@ module OpenTox
d = 1.0 - x.to_f
Math.exp(-(d*d)/(2*sigma*sigma))
end
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+ def self.isnull_or_singular?(array)
+ nr_zeroes = array.count(0)
+ return (nr_zeroes == array.size) || # remove non-occurring feature
+ (nr_zeroes == array.size-1) || # remove singular feature
+ (nr_zeroes == 0) # also remove feature present everywhere
+ end
# Median of an array
# @param [Array] Array with values
@@ -583,14 +799,13 @@ module OpenTox
return sum
end
-
# Minimum Frequency
# @param [Integer] per-mil value
# return [Integer] min-frequency
def self.min_frequency(training_dataset,per_mil)
- minfreq = per_mil*training_dataset.compounds.size/1000 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
minfreq = 2 unless minfreq > 2
- minfreq
+ Integer (minfreq)
end
# Effect calculation for classification