=begin * Name: transform.rb * Description: Transformation algorithms * Author: Andreas Maunz compression k -= 1 } k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1) # Take the k-rank approximation of the Matrix # - Take first k columns of u # - Take first k columns of vt # - Take the first k eigenvalues @uk = u.submatrix(nil, (0..k)) # used to transform column format data @vk = vt.submatrix(nil, (0..k)) # used to transform row format data s = GSL::Matrix.diagonal(s) @eigk = s.submatrix((0..k), (0..k)) @eigk_inv = @eigk.inv # Transform data @data_transformed_matrix = @uk # = u for all SVs # NOTE: @data_transformed_matrix is also equal to # @data_matrix * @vk * @eigk_inv end # Transforms data instance (1 row) to feature space found by SVD. # # @param [GSL::Matrix] values Data matrix (1 x m). # @return [GSL::Matrix] Transformed data matrix. def transform_instance values values * @vk * @eigk_inv end alias :transform :transform_instance # make this the default (see PCA interface) # Transforms data feature (1 column) to feature space found by SVD. # # @param [GSL::Matrix] values Data matrix (1 x n). # @return [GSL::Matrix] Transformed data matrix. def transform_feature values values * @uk * @eigk_inv end # Restores data in the original feature space (possibly with compression loss). # # @return [GSL::Matrix] Data matrix. def restore @data_transformed_matrix * @eigk * @vk.transpose # reverse svd end end # Attaches transformations to an OpenTox::Model # Stores props, sims, performs similarity calculations class ModelTransformer attr_accessor :model, :similarity_algorithm, :activities, :sims, :n_prop, :q_prop # @params[OpenTox::Model] model Model to transform def initialize model @model = model end # Transforms the model def transform get_matrices # creates @n_prop, @q_prop, @activities from ordered fingerprints @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors if (@model.similarity_algorithm =~ /cosine/) # truncate nil-columns and -rows $logger.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" while @q_prop.size>0 idx = @q_prop.index(nil) break if idx.nil? @q_prop.slice!(idx) @n_prop.each { |r| r.slice!(idx) } end $logger.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors $logger.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" # adjust rest #fingerprints_tmp = []; @ids.each { |idx| fingerprints_tmp << @fingerprints[idx] }; @fingerprints = fingerprints_tmp compounds_tmp = []; @ids.each { |idx| compounds_tmp << @compounds[idx] }; @compounds = compounds_tmp acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp # scale and svd nr_cases, nr_features = @n_prop.size, @n_prop[0].size gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup (0...nr_features).each { |i| autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(gsl_n_prop.col(i)) gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i) } svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0) @n_prop = svd.data_transformed_matrix.to_a @q_prop = svd.transform(gsl_q_prop).row(0).to_a $logger.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" else convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched end # neighbor calculation @ids = [] # surviving compounds become neighbors @sims = [] # calculated by neighbor routine =begin neighbors n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp # Sims between neighbors, if necessary gram_matrix = [] if !@model.propositionalized && !@model.prediction_algorithm == "weighted_majority_vote" # need gram matrix for SVM (n. prop.) @n_prop.each_index do |i| gram_matrix[i] = [] unless gram_matrix[i] @n_prop.each_index do |j| if (j>i) sim = OpenTox::Algorithm::Similarity.send(@similarity_algorithm.to_sym, @n_prop[i], @n_prop[j]) gram_matrix[i][j] = sim gram_matrix[j] = [] unless gram_matrix[j] gram_matrix[j][i] = gram_matrix[i][j] end end gram_matrix[i][i] = 1.0 end end # reclaim original data (if svd was performed) if svd @n_prop = gsl_n_prop_orig.to_a n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp @q_prop = gsl_q_prop_orig.row(0).to_a end $logger.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop) $logger.debug "Sims: #{@sims.size}, Acts: #{@activities.size}" @sims = [ gram_matrix, @sims ] =end end # Find neighbors and store them as object variable, access all compounds for that. def neighbors @model.neighbors = [] @n_prop.each_with_index do |fp, idx| # AM: access all compounds add_neighbor fp, idx end end # Adds a neighbor to @neighbors if it passes the similarity threshold # @param[Array] training_props Propositionalized data for this neighbor # @param[Integer] Index of neighbor def add_neighbor(training_props, idx) unless @model.training_activities[idx].nil? sim = similarity(training_props) if sim > @model.min_sim.to_f @model.neighbors << { :compound => @compounds[idx], :similarity => sim, :activity => activities[idx] } @sims << sim @ids << idx end end end # Removes nil entries from n_prop and q_prop. # Matrix is a nested two-dimensional array. # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed. # Tie break: columns take precedence. # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero. # Enables the use of cosine similarity / SVD def remove_nils return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0) col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) } row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) } m_cols = col_nr_nils.max m_rows = row_nr_nils.max idx_cols = col_nr_nils.index(m_cols) idx_rows = row_nr_nils.index(m_rows) while ((m_cols > 0) || (m_rows > 0)) do if m_cols >= m_rows @n_prop.each { |row| row.slice!(idx_cols) } @q_prop.slice!(idx_cols) else @n_prop.slice!(idx_rows) @ids.slice!(idx_rows) end break if (@n_prop.length == 0) || (@n_prop[0].length == 0) col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) } row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) } m_cols = col_nr_nils.max m_rows = row_nr_nils.max idx_cols= col_nr_nils.index(m_cols) idx_rows = row_nr_nils.index(m_rows) end end # Replaces nils by zeroes in n_prop and q_prop # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop) def convert_nils @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } } @q_prop.collect! { |v| v.nil? ? 0 : v } end # Executes model similarity_algorithm # @param[Array] A propositionalized data entry # @return[Float] Similarity to query structure def similarity(training_props) eval("#{@model.similarity_algorithm}(#{training_props}, #{@q_prop})") #OpenTox::Algorithm::Similarity.send(@model.similarity_algorithm,training_props, @q_prop) end # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed. # Same for compound fingerprints. def get_matrices @compounds = @model.training_dataset.compounds.clone # TODO select predicted variable @activities = @model.training_activities @n_prop = @model.feature_dataset.data_entries.clone @q_prop = @model.query_fingerprint.flatten.clone end # Returns propositionalized data, if appropriate, or nil # @return [Array] Propositionalized data, or nil def props @model.propositionalized ? [ @n_prop, @q_prop ] : nil end end end end end