summary | refs | log | tree | commit | diff
path: root/lib/algorithm/transform.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/algorithm/transform.rb')
-rw-r--r-- lib/algorithm/transform.rb 282
1 files changed, 116 insertions, 166 deletions
diff --git a/lib/algorithm/transform.rb b/lib/algorithm/transform.rb
index afc76f9..ec25526 100644
--- a/lib/algorithm/transform.rb
+++ b/lib/algorithm/transform.rb
@@ -18,42 +18,27 @@ module OpenTox
# @param [GSL::Vector] values Values to transform using AutoScaling.
def initialize values
- begin
- raise "Cannot transform, values empty." if values.size==0
- vs = values.clone
- @mean = vs.to_scale.mean
- @stdev = vs.to_scale.standard_deviation_population
- @stdev = 0.0 if @stdev.nan?
- @vs = transform vs
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ bad_request_error "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @mean = vs.to_scale.mean
+ @stdev = vs.to_scale.standard_deviation_population
+ @stdev = 0.0 if @stdev.nan?
+ @vs = transform vs
end
# @param [GSL::Vector] values Values to transform.
# @return [GSL::Vector] transformed values.
def transform values
- begin
- raise "Cannot transform, values empty." if values.size==0
- autoscale values.clone
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ bad_request_error "Cannot transform, values empty." if values.size==0
+ autoscale values.clone
end
# @param [GSL::Vector] values Values to restore.
# @return [GSL::Vector] transformed values.
def restore values
- begin
- raise "Cannot transform, values empty." if values.size==0
- rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
- (rv_ss + @mean).to_gsl
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ bad_request_error "Cannot transform, values empty." if values.size==0
+ rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+ (rv_ss + @mean).to_gsl
end
# @param [GSL::Vector] values to transform.
@@ -77,68 +62,63 @@ module OpenTox
# @return [GSL::Matrix] Data transformed matrix.
def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
- begin
- @data_matrix = data_matrix.clone
- @compression = compression.to_f
- @mean = Array.new
- @autoscaler = Array.new
- @cols = Array.new
- @maxcols = maxcols
-
- # Objective Feature Selection
- raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
- @data_matrix_selected = nil
- (0..@data_matrix.size2-1).each { |i|
- if !@data_matrix.col(i).to_a.zero_variance?
- if @data_matrix_selected.nil?
- @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
- @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
- else
- @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
- end
- @cols << i
- end
- }
- raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
-
- # PCA uses internal centering on 0
- @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
- (0..@cols.size-1).each { |i|
- as = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
- @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
- @mean << as.mean
- @autoscaler << as
- }
-
- # PCA
- data_matrix_hash = Hash.new
- (0..@cols.size-1).each { |i|
- column_view = @data_matrix_scaled.col(i)
- data_matrix_hash[i] = column_view.to_scale
- }
- dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
- cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
- pca=Statsample::Factor::PCA.new(cor_matrix)
-
- # Select best eigenvectors
- pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
- @eigenvalue_sums = Array.new
- (0..@cols.size-1).each { |i|
- @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
- }
- eigenvectors_selected = Array.new
- pca.eigenvectors.each_with_index { |ev, i|
- if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
- eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+ @data_matrix = data_matrix.clone
+ @compression = compression.to_f
+ @mean = Array.new
+ @autoscaler = Array.new
+ @cols = Array.new
+ @maxcols = maxcols
+
+ # Objective Feature Selection
+ bad_request_error "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !@data_matrix.col(i).to_a.zero_variance?
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
end
- }
- @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
- @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+ @cols << i
+ end
+ }
+ bad_request_error "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # PCA uses internal centering on 0
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+ (0..@cols.size-1).each { |i|
+ as = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+ @mean << as.mean
+ @autoscaler << as
+ }
+
+ # PCA
+ data_matrix_hash = Hash.new
+ (0..@cols.size-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+
+ # Select best eigenvectors
+ pca.eigenvalues.each { |ev| bad_request_error "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..@cols.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
end
# Transforms data to feature space found by PCA.
@@ -146,35 +126,25 @@ module OpenTox
# @param [GSL::Matrix] values Data matrix.
# @return [GSL::Matrix] Transformed data matrix.
def transform values
- begin
- vs = values.clone
- raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
- data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
- @cols.each_with_index { |i,j|
- data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
- }
- (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ vs = values.clone
+ bad_request_error "Error! Too few columns for transformation." if vs.size2 < @cols.max
+ data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+ @cols.each_with_index { |i,j|
+ data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+ }
+ (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
end
# Restores data in the original feature space (possibly with compression loss).
#
# @return [GSL::Matrix] Data matrix.
def restore
- begin
- data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
- # reverse scaling
- (0..@cols.size-1).each { |i|
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
- }
- data_matrix_restored
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..@cols.size-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
end
end
@@ -191,45 +161,40 @@ module OpenTox
# @return [GSL::Matrix] Data transformed matrix
def initialize data_matrix, compression=0.05
- begin
- @data_matrix = data_matrix.clone
- @compression = compression
-
- # Compute the SV Decomposition X=USV
- # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
- u, vt, s = data_matrix.SV_decomp
-
- # Determine cutoff index
- s2 = s.mul(s) ; s2_sum = s2.sum
- s2_run = 0
- k = s2.size - 1
- s2.to_a.reverse.each { |v|
- s2_run += v
- frac = s2_run / s2_sum
- break if frac > compression
- k -= 1
- }
- k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
-
- # Take the k-rank approximation of the Matrix
- # - Take first k columns of u
- # - Take first k columns of vt
- # - Take the first k eigenvalues
- @uk = u.submatrix(nil, (0..k)) # used to transform column format data
- @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
- s = GSL::Matrix.diagonal(s)
- @eigk = s.submatrix((0..k), (0..k))
- @eigk_inv = @eigk.inv
-
- # Transform data
- @data_transformed_matrix = @uk # = u for all SVs
- # NOTE: @data_transformed_matrix is also equal to
- # @data_matrix * @vk * @eigk_inv
-
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ @data_matrix = data_matrix.clone
+ @compression = compression
+
+ # Compute the SV Decomposition X=USV
+ # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+ u, vt, s = data_matrix.SV_decomp
+
+ # Determine cutoff index
+ s2 = s.mul(s) ; s2_sum = s2.sum
+ s2_run = 0
+ k = s2.size - 1
+ s2.to_a.reverse.each { |v|
+ s2_run += v
+ frac = s2_run / s2_sum
+ break if frac > compression
+ k -= 1
+ }
+ k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+
+ # Take the k-rank approximation of the Matrix
+ # - Take first k columns of u
+ # - Take first k columns of vt
+ # - Take the first k eigenvalues
+ @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+ @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+ s = GSL::Matrix.diagonal(s)
+ @eigk = s.submatrix((0..k), (0..k))
+ @eigk_inv = @eigk.inv
+
+ # Transform data
+ @data_transformed_matrix = @uk # = u for all SVs
+ # NOTE: @data_transformed_matrix is also equal to
+ # @data_matrix * @vk * @eigk_inv
+
end
@@ -238,12 +203,7 @@ module OpenTox
# @param [GSL::Matrix] values Data matrix (1 x m).
# @return [GSL::Matrix] Transformed data matrix.
def transform_instance values
- begin
- values * @vk * @eigk_inv
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ values * @vk * @eigk_inv
end
alias :transform :transform_instance # make this the default (see PCA interface)
@@ -252,12 +212,7 @@ module OpenTox
# @param [GSL::Matrix] values Data matrix (1 x n).
# @return [GSL::Matrix] Transformed data matrix.
def transform_feature values
- begin
- values * @uk * @eigk_inv
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ values * @uk * @eigk_inv
end
@@ -265,12 +220,7 @@ module OpenTox
#
# @return [GSL::Matrix] Data matrix.
def restore
- begin
- @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
+ @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
end