diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-02-28 10:34:58 +0100 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-02-28 10:34:58 +0100 |
commit | 32cd79cd055ccd0aa9de29d68ca8c3e63663b126 (patch) | |
tree | 5058c65d249e39983eac1d2ae0641d069f68937e | |
parent | 04e8d118ec8e4d09b4e3ad6538aa7e8974613076 (diff) | |
parent | 754fd31f83be3ac7557a2f7832ff44fb262a98a5 (diff) |
Merge branch 'development' of github.com:opentox/opentox-ruby into development
-rw-r--r-- | Rakefile | 1 | ||||
-rw-r--r-- | lib/algorithm.rb | 908 | ||||
-rw-r--r-- | lib/compound.rb | 61 | ||||
-rw-r--r-- | lib/dataset.rb | 12 | ||||
-rw-r--r-- | lib/environment.rb | 2 | ||||
-rw-r--r-- | lib/model.rb | 107 | ||||
-rw-r--r-- | lib/parser.rb | 170 | ||||
-rw-r--r-- | lib/rest_client_wrapper.rb | 2 | ||||
-rw-r--r-- | lib/serializer.rb | 73 | ||||
-rw-r--r-- | lib/transform.rb | 520 | ||||
-rw-r--r-- | lib/utils.rb | 372 | ||||
-rw-r--r-- | opentox-ruby.gemspec | 5 |
12 files changed, 1382 insertions, 851 deletions
@@ -45,7 +45,6 @@ begin gem.add_dependency "ruby-plot", "=0.6.1" gem.add_dependency "gsl", "=1.14.7" gem.add_dependency "statsample", "=1.1.0" - #gem.add_dependency "statsample-optimization", "=2.1.0" gem.add_development_dependency 'jeweler' gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore'] diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 3bfe789..db21c46 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -5,6 +5,8 @@ R = nil require "rinruby" require "statsample" require 'uri' +require 'transform.rb' +require 'utils.rb' module OpenTox @@ -13,7 +15,7 @@ module OpenTox include OpenTox - # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters + # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters # @param [optional,Hash] params Algorithm parameters # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly # @return [String] URI of new resource (dataset, model, ...) @@ -21,7 +23,7 @@ module OpenTox LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s end - + # Get OWL-DL representation in RDF/XML format # @return [application/rdf+xml] RDF/XML representation def to_rdfxml @@ -33,7 +35,7 @@ module OpenTox # Generic Algorithm class, should work with all OpenTox webservices class Generic include Algorithm - + # Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error # @param [String] uri Algorithm URI # @return [OpenTox::Algorithm::Generic] Algorithm instance @@ -44,14 +46,14 @@ module OpenTox raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0 alg end - + end # Fminer algorithms (https://github.com/amaunz/fminer2) class Fminer include Algorithm attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi - + def check_params(params,per_mil,subjectid=nil) raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? @@ -81,7 +83,7 @@ module OpenTox LOGGER.warn "Cannot find smiles for #{compound.to_s}." next end - + value_map=params[:value_map] unless params[:value_map].nil? entry.each do |feature,values| if feature == @prediction_feature.uri @@ -115,23 +117,23 @@ module OpenTox end - # Backbone Refinement Class mining (http://bbrc.maunz.de/) - class BBRC < Fminer - # Initialize bbrc algorithm - def initialize(subjectid=nil) - super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc") - load_metadata(subjectid) - end + # Backbone Refinement Class mining (http://bbrc.maunz.de/) + class BBRC < Fminer + # Initialize bbrc algorithm + def initialize(subjectid=nil) + super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc") + load_metadata(subjectid) end + end - # LAtent STructure Pattern Mining (http://last-pm.maunz.de) - class LAST < Fminer - # Initialize last algorithm - def initialize(subjectid=nil) - super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last") - load_metadata(subjectid) - end + # LAtent STructure Pattern Mining (http://last-pm.maunz.de) + class LAST < Fminer + # Initialize last algorithm + def initialize(subjectid=nil) + super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last") + load_metadata(subjectid) end + end # Create lazar prediction model @@ -144,72 +146,6 @@ module OpenTox end end - # Utility methods without dedicated webservices - - # Similarity calculations - module Similarity - include Algorithm - - # Tanimoto similarity - # @param [Array] features_a Features of first compound - # @param [Array] features_b Features of second compound - # @param [optional, Hash] weights Weights for all features - # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required - # @return [Float] (Weighted) tanimoto similarity - def self.tanimoto(features_a,features_b,weights=nil,params=nil) - common_features = features_a & features_b - all_features = (features_a + features_b).uniq - #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}" - if common_features.size > 0 - if weights - #LOGGER.debug "nr_hits: #{params[:nr_hits]}" - if !params.nil? && params[:nr_hits] - params[:weights] = weights - params[:mode] = "min" - params[:features] = common_features - common_p_sum = Algorithm.p_sum_support(params) - params[:mode] = "max" - params[:features] = all_features - all_p_sum = Algorithm.p_sum_support(params) - else - common_p_sum = 0.0 - common_features.each{|f| common_p_sum += weights[f]} - all_p_sum = 0.0 - all_features.each{|f| all_p_sum += weights[f]} - end - #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}" - common_p_sum/all_p_sum - else - #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}" - common_features.size.to_f/all_features.size.to_f - end - else - 0.0 - end - end - - # Euclidean similarity - # @param [Hash] properties_a Properties of first compound - # @param [Hash] properties_b Properties of second compound - # @param [optional, Hash] weights Weights for all properties - # @return [Float] (Weighted) euclidean similarity - def self.euclidean(properties_a,properties_b,weights=nil) - common_properties = properties_a.keys & properties_b.keys - if common_properties.size > 1 - dist_sum = 0 - common_properties.each do |p| - if weights - dist_sum += ( (properties_a[p] - properties_b[p]) * weights[p] )**2 - else - dist_sum += (properties_a[p] - properties_b[p])**2 - end - end - 1/(1+Math.sqrt(dist_sum)) - else - 0.0 - end - end - end # Structural Graph Clustering by TU Munich # Finds clusters similar to a query structure in a given training dataset @@ -226,7 +162,7 @@ module OpenTox raise "Invalid URI." end @training_dataset_uri = training_dataset_uri - if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1 + if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1 raise "Training threshold out of bounds." end @training_threshold = training_threshold.to_f @@ -259,7 +195,7 @@ module OpenTox # @params[Float] Similarity threshold for query to clusters (optional) def get_clusters query_compound_uri, query_threshold = 0.5 - if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1 + if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1 raise "Query threshold out of bounds." end @query_threshold = query_threshold.to_f @@ -285,7 +221,7 @@ module OpenTox metadata[DC.title][pattern]="" feature_clusterid_map[feature_uri] = metadata[DC.title].to_i } - + # Integrity check unless cluster_query_dataset.compounds.size == 1 raise "Number of predicted compounds is != 1." @@ -295,11 +231,11 @@ module OpenTox query_compound_uri = cluster_query_dataset.compounds[0] @target_clusters_array = Array.new cluster_query_dataset.features.keys.each { |cluster_membership_feature| - + # Getting dataset URI for cluster target_cluster = feature_clusterid_map[cluster_membership_feature] dataset = @clusterid_dataset_map[target_cluster] - + # Finally look up presence data_entry = cluster_query_dataset.data_entries[query_compound_uri] present = data_entry[cluster_membership_feature][0] @@ -311,89 +247,13 @@ module OpenTox end - module Neighbors - - # Local multi-linear regression (MLR) prediction from neighbors. - # Uses propositionalized setting. - # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required - # @return [Numeric] A prediction value. - def self.local_mlr_prop(params) - - confidence=0.0 - prediction=nil - - if params[:neighbors].size>0 - props = params[:prop_kernel] ? get_props(params) : nil - acts = params[:neighbors].collect { |n| act = n[:activity].to_f } - sims = params[:neighbors].collect { |n| n[:similarity] } - LOGGER.debug "Local MLR (Propositionalization / GSL)." - prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} ) - if params[:transform].to_s == "classNOP" - transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}])") - else - transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}]), #{params[:transform]["offset"]})") - end - prediction = transformer.values[0] - prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction - LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - params[:conf_stdev] = false if params[:conf_stdev].nil? - confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) - confidence = nil if prediction.nil? - end - {:prediction => prediction, :confidence => confidence} - - end - - # Multi-linear regression weighted by similarity. - # Objective Feature Selection, Principal Components Analysis, Scaling of Axes. - # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required - # @return [Numeric] A prediction value. - def self.mlr(params) - - # GSL matrix operations: - # to_a : row-wise conversion to nested array - # - # Statsample operations (build on GSL): - # to_scale: convert into Statsample format - - begin - n_prop = params[:n_prop].collect { |v| v } - q_prop = params[:q_prop].collect { |v| v } - n_prop << q_prop # attach q_prop - nr_cases, nr_features = get_sizes n_prop - data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features) - - # Principal Components Analysis - LOGGER.debug "PCA..." - pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix) - data_matrix = pca.data_transformed_matrix - - # Attach intercept column to data - intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1) - data_matrix = data_matrix.horzcat(intercept) - (0..data_matrix.size2-2).each { |i| - autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i)) - data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values - } - # Detach query instance - n_prop = data_matrix.to_a - q_prop = n_prop.pop - nr_cases, nr_features = get_sizes n_prop - data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features) - # model + support vectors - LOGGER.debug "Creating MLR model ..." - c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl) - GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0] - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - end + module Neighbors - end # Classification with majority vote from neighbors weighted by similarity - # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @param [Hash] params Keys `:acts, :sims, :value_map` are required # @return [Numeric] A prediction value. def self.weighted_majority_vote(params) @@ -402,12 +262,13 @@ module OpenTox confidence = 0.0 prediction = nil - params[:neighbors].each do |neighbor| - neighbor_weight = neighbor[:similarity].to_f - neighbor_contribution += neighbor[:activity].to_f * neighbor_weight + LOGGER.debug "Weighted Majority Vote Classification." + params[:acts].each_index do |idx| + neighbor_weight = params[:sims][1][idx] + neighbor_contribution += params[:acts][idx] * neighbor_weight if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true - case neighbor[:activity] + case params[:acts][idx] when 1 confidence_sum -= neighbor_weight when 2 @@ -417,298 +278,257 @@ module OpenTox confidence_sum += neighbor_weight end end - if params[:value_map].size == 2 if confidence_sum >= 0.0 - prediction = 2 unless params[:neighbors].size==0 + prediction = 2 unless params[:acts].size==0 elsif confidence_sum < 0.0 - prediction = 1 unless params[:neighbors].size==0 + prediction = 1 unless params[:acts].size==0 end else - prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction + prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction end + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil? - confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0 + confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0 LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil? return {:prediction => prediction, :confidence => confidence.abs} end + + # Local support vector regression from neighbors - # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required # @return [Numeric] A prediction value. def self.local_svm_regression(params) - confidence = 0.0 - prediction = nil - if params[:neighbors].size>0 - props = params[:prop_kernel] ? get_props(params) : nil - acts = params[:neighbors].collect{ |n| n[:activity].to_f } - sims = params[:neighbors].collect{ |n| n[:similarity] } - prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr") - if params[:transform].to_s == "classNOP" - transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}])") - else - transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}]), #{params[:transform]["offset"]})") + begin + confidence = 0.0 + prediction = nil + + LOGGER.debug "Local SVM." + if params[:acts].size>0 + if params[:props] + n_prop = params[:props][0].collect + q_prop = params[:props][1].collect + props = [ n_prop, q_prop ] + end + acts = params[:acts].collect + prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = nil if (!prediction.nil? && prediction.infinite?) + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]}) + confidence = 0.0 if prediction.nil? end - prediction = transformer.values[0] - prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction - LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - params[:conf_stdev] = false if params[:conf_stdev].nil? - confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) - confidence = nil if prediction.nil? + {:prediction => prediction, :confidence => confidence} + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end - {:prediction => prediction, :confidence => confidence} - + end - # Local support vector classification from neighbors - # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required # @return [Numeric] A prediction value. def self.local_svm_classification(params) - confidence = 0.0 - prediction = nil - if params[:neighbors].size>0 - props = params[:prop_kernel] ? get_props(params) : nil - acts = params[:neighbors].collect { |n| act = n[:activity] } - sims = params[:neighbors].collect{ |n| n[:similarity] } # similarity values btwn q and nbors - prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc") - LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - params[:conf_stdev] = false if params[:conf_stdev].nil? - confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) + begin + confidence = 0.0 + prediction = nil + + LOGGER.debug "Local SVM." + if params[:acts].size>0 + if params[:props] + n_prop = params[:props][0].collect + q_prop = params[:props][1].collect + props = [ n_prop, q_prop ] + end + acts = params[:acts].collect + acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float + confidence = 0.0 if prediction.nil? + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]}) + end + {:prediction => prediction, :confidence => confidence} + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end - {:prediction => prediction, :confidence => confidence} - + end + # Local support vector prediction from neighbors. - # Uses pre-defined Kernel Matrix. + # Uses propositionalized setting. # Not to be called directly (use local_svm_regression or local_svm_classification). + # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] # @param [Array] acts, activities for neighbors. - # @param [Array] sims, similarities for neighbors. - # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification). - # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @param [Float] min_train_performance, parameter to control censoring # @return [Numeric] A prediction value. - def self.local_svm(acts, sims, type, params) - LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)." - neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches - gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel + def self.local_svm_prop(props, acts, min_train_performance) + + LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)." + n_prop = props[0] # is a matrix, i.e. two nested Arrays. + q_prop = props[1] # is an Array. prediction = nil if Algorithm::zero_variance? acts prediction = acts[0] else - # gram matrix - (0..(neighbor_matches.length-1)).each do |i| - neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]] - gram_matrix[i] = [] unless gram_matrix[i] - # upper triangle - ((i+1)..(neighbor_matches.length-1)).each do |j| - neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]] - sim_params = {} - if params[:nr_hits] - sim_params[:nr_hits] = true - sim_params[:compound_features_hits] = neighbor_i_hits - sim_params[:training_compound_features_hits] = neighbor_j_hits - end - sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)") - gram_matrix[i][j] = sim - gram_matrix[j] = [] unless gram_matrix[j] - gram_matrix[j][i] = gram_matrix[i][j] # lower triangle - end - gram_matrix[i][i] = 1.0 - end - - #LOGGER.debug gram_matrix.to_yaml @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed - LOGGER.debug "Setting R data ..." - # set data - @r.gram_matrix = gram_matrix.flatten - @r.n = neighbor_matches.size - @r.y = acts - @r.sims = sims - + @r.eval "set.seed(1)" + @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab" + @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore" + @r.eval "registerDoMC()" # switch on parallel processing begin - LOGGER.debug "Preparing R data ..." - # prepare data - @r.eval "y<-as.vector(y)" - @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))" - @r.eval "sims<-as.vector(sims)" - - # model + support vectors - LOGGER.debug "Creating SVM model ..." - @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)" - @r.eval "sv<-as.vector(SVindex(model))" - @r.eval "sims<-sims[sv]" - @r.eval "sims<-as.kernelMatrix(matrix(sims,1))" - LOGGER.debug "Predicting ..." - if type == "nu-svr" - @r.eval "p<-predict(model,sims)[1,1]" - elsif type == "C-bsvc" - @r.eval "p<-predict(model,sims)" - end - if type == "nu-svr" - prediction = @r.p - elsif type == "C-bsvc" - #prediction = (@r.p.to_f == 1.0 ? true : false) - prediction = @r.p - end - @r.quit # free R - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - end - prediction - end - - # Local support vector prediction from neighbors. - # Uses propositionalized setting. - # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] acts, activities for neighbors. - # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification). - # @return [Numeric] A prediction value. - def self.local_svm_prop(props, acts, type) - - LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)." - n_prop = props[0] # is a matrix, i.e. two nested Arrays. - q_prop = props[1] # is an Array. - - prediction = nil - if Algorithm::zero_variance? acts - prediction = acts[0] - else - #LOGGER.debug gram_matrix.to_yaml - @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed - LOGGER.debug "Setting R data ..." # set data + LOGGER.debug "Setting R data ..." @r.n_prop = n_prop.flatten @r.n_prop_x_size = n_prop.size @r.n_prop_y_size = n_prop[0].size @r.y = acts @r.q_prop = q_prop + #@r.eval "y = matrix(y)" + @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" + @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" - begin - LOGGER.debug "Preparing R data ..." - # prepare data - @r.eval "y<-matrix(y)" - @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)" - @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)" - - # model + support vectors - LOGGER.debug "Creating SVM model ..." - @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)" - LOGGER.debug "Predicting ..." - if type == "nu-svr" - @r.eval "p<-predict(model,q_prop)[1,1]" - elsif type == "C-bsvc" - @r.eval "p<-predict(model,q_prop)" - end - if type == "nu-svr" - prediction = @r.p - elsif type == "C-bsvc" - #prediction = (@r.p.to_f == 1.0 ? true : false) - prediction = @r.p - end - @r.quit # free R - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - end - prediction - end + # prepare data + LOGGER.debug "Preparing R data ..." + @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification + + @r.eval <<-EOR + rem = nearZeroVar(prop_matrix) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + rem = findCorrelation(cor(prop_matrix)) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + EOR - # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set. - # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev - # @return[Float] Confidence - def self.get_confidence(params) - if params[:conf_stdev] - sim_median = params[:sims].to_scale.median - if sim_median.nil? - confidence = nil - else - standard_deviation = params[:acts].to_scale.standard_deviation_sample - confidence = (sim_median*Math.exp(-1*standard_deviation)).abs - if confidence.nan? - confidence = nil - end - end - else - conf = params[:sims].inject{|sum,x| sum + x } - confidence = conf/params[:neighbors].size - end - LOGGER.debug "Confidence is: '" + confidence.to_s + "'." - return confidence - end + # model + support vectors + LOGGER.debug "Creating R SVM model ..." + @r.eval <<-EOR + model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale")) + perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) + EOR - # Get X and Y size of a nested Array (Matrix) - def self.get_sizes(matrix) - begin - nr_cases = matrix.size - nr_features = matrix[0].size - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - #puts "NRC: #{nr_cases}, NRF: #{nr_features}" - [ nr_cases, nr_features ] - end - # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features) - # Same for the vector describing the query compound - # @param[Array] neighbors. - # @param[OpenTox::Compound] query compound. - # @param[Array] Dataset Features. - # @param[Array] Fingerprints of neighbors. - # @param[Float] p-values of Features. - def self.get_props (params) - matrix = Array.new - begin - params[:neighbors].each do |n| - n = n[:compound] - row = [] - params[:features].each do |f| - if ! params[:fingerprints][n].nil? - row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0) - else - row << 0.0 - end - end - matrix << row - end - row = [] - params[:features].each do |f| - if params[:nr_hits] - compound_feature_hits = params[:compound].match_hits([f]) - row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f])) - else - row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f]) - end + # prediction + LOGGER.debug "Predicting ..." + @r.eval "p = predict(model,q_prop)" + @r.eval "if (class(y)!='numeric') p = as.character(p)" + prediction = @r.p + + # censoring + prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance ) + LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}" + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end - rescue Exception => e - LOGGER.debug "get_props failed with '" + $! + "'" + @r.quit # free R end - [ matrix, row ] + prediction end end + module FeatureSelection + include Algorithm + # Recursive Feature Elimination using caret + # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values). + # @return [String] feature dataset CSV file composed of selected features. + def self.rfe(params) + @r=RinRuby.new(false,false) + @r.ds_csv_file = params[:ds_csv_file].to_s + @r.prediction_feature = params[:prediction_feature].to_s + @r.fds_csv_file = params[:fds_csv_file].to_s + @r.del_missing = params[:del_missing] == true ? 1 : 0 + r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_") + @r.f_fds_r = r_result_file.to_s + + # need packs 'randomForest', 'RANN' + @r.eval <<-EOR + set.seed(1) + suppressPackageStartupMessages(library('caret')) + suppressPackageStartupMessages(library('randomForest')) + suppressPackageStartupMessages(library('RANN')) + suppressPackageStartupMessages(library('doMC')) + registerDoMC() + + acts = read.csv(ds_csv_file, check.names=F) + feats = read.csv(fds_csv_file, check.names=F) + ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-) + + features = ds[,(dim(acts)[2]+1):(dim(ds)[2])] + y = ds[,which(names(ds) == prediction_feature)] + + # assumes a data matrix 'features' and a vector 'y' of target values + row.names(features)=NULL + + pp = NULL + if (del_missing) { + # needed if rows should be removed + na_ids = apply(features,1,function(x)any(is.na(x))) + features = features[!na_ids,] + y = y[!na_ids] + pp = preProcess(features, method=c("scale", "center")) + } else { + # Use imputation if NA's random (only then!) + pp = preProcess(features, method=c("scale", "center", "knnImpute")) + } + features = predict(pp, features) + + # determine subsets + subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) + subsets = c(2,3,4,5,7,10,subsets) + subsets = unique(sort(round(subsets))) + subsets = subsets[subsets<=dim(features)[2]] + subsets = subsets[subsets>1] + + # Recursive feature elimination + rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets) + + # read existing dataset and select most useful features + csv=feats[,c("SMILES", rfProfile$optVariables)] + write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='') + EOR + r_result_file + end + end + module Substructure include Algorithm # Substructure matching - # @param [OpenTox::Compound] compound Compound - # @param [Array] features Array with Smarts strings + # @param [Hash] required keys: compound, features # @return [Array] Array with matching Smarts - def self.match(compound,features) - compound.match(features) + def self.match(params) + params[:compound].match(params[:features]) end + + # Substructure matching with number of non-unique hits + # @param [Hash] required keys: compound, features + # @return [Hash] Hash with matching Smarts and number of hits + def self.match_hits(params) + params[:compound].match_hits(params[:features]) + end + + # Substructure matching with number of non-unique hits + # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type + # @return [Hash] Hash with matching Smarts and number of hits + def self.lookup(params) + params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type]) + end end module Dataset @@ -717,283 +537,5 @@ module OpenTox def features(dataset_uri,compound_uri) end end - - module Transform - include Algorithm - - # The transformer that inverts values. - # 1/x is used, after values have been moved >= 1. - class Inverter - attr_accessor :offset, :values - - # @params[Array] Values to transform. - # @params[Float] Offset for restore. - def initialize *args - case args.size - when 1 - begin - values=args[0] - raise "Cannot transform, values empty." if @values.size==0 - @values = values.collect { |v| -1.0 * v } - @offset = 1.0 - @values.minmax[0] - @offset = -1.0 * @offset if @offset>0.0 - @values.collect! { |v| v - @offset } # slide >1 - @values.collect! { |v| 1 / v } # invert to [0,1] - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - when 2 - @offset = args[1].to_f - @values = args[0].collect { |v| 1 / v } - @values.collect! { |v| v + @offset } - @values.collect! { |v| -1.0 * v } - end - end - end - - # The transformer that takes logs. - # Log10 is used, after values have been moved > 0. - class Log10 - attr_accessor :offset, :values - - # @params[Array] Values to transform / restore. - # @params[Float] Offset for restore. - def initialize *args - @distance_to_zero = 0.000000001 # 1 / 1 billion - case args.size - when 1 - begin - values=args[0] - raise "Cannot transform, values empty." if values.size==0 - @offset = values.minmax[0] - @offset = -1.0 * @offset if @offset>0.0 - @values = values.collect { |v| v - @offset } # slide > anchor - @values.collect! { |v| v + @distance_to_zero } # - @values.collect! { |v| Math::log10 v } # log10 (can fail) - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - when 2 - @offset = args[1].to_f - @values = args[0].collect { |v| 10**v } - @values.collect! { |v| v - @distance_to_zero } - @values.collect! { |v| v + @offset } - end - end - end - - # The transformer that does nothing (No OPeration). - class NOP - attr_accessor :offset, :values - - # @params[Array] Values to transform / restore. - # @params[Float] Offset for restore. - def initialize *args - @offset = 0.0 - @distance_to_zero = 0.0 - case args.size - when 1 - @values = args[0] - when 2 - @values = args[0] - end - end - end - - - # Auto-Scaler for Arrays - # Center on mean and divide by standard deviation - class AutoScale - attr_accessor :scaled_values, :mean, :stdev - - # @params[Array] Values to transform. - def initialize values - @scaled_values = values - @mean = @scaled_values.to_scale.mean - @stdev = @scaled_values.to_scale.standard_deviation_sample - @scaled_values = @scaled_values.collect {|vi| vi - @mean } - @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0 - end - end - - # Principal Components Analysis - # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos - class PCA - attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler - - # Creates a transformed dataset as GSL::Matrix. - # @param [GSL::Matrix] Data matrix. - # @param [Float] Compression ratio from [0,1]. - # @return [GSL::Matrix] Data transformed matrix. - def initialize data_matrix, compression=0.05 - begin - @data_matrix = data_matrix - @compression = compression.to_f - @stdev = Array.new - @mean = Array.new - - # Objective Feature Selection - raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2 - @data_matrix_selected = nil - (0..@data_matrix.size2-1).each { |i| - if !Algorithm::zero_variance?(@data_matrix.col(i).to_a) - if @data_matrix_selected.nil? - @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1) - @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i) - else - @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1)) - end - end - } - raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2) - - # Scaling of Axes - @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2) - (0..@data_matrix_selected.size2-1).each { |i| - @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i)) - @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values - @stdev << @autoscaler.stdev - @mean << @autoscaler.mean - } - - data_matrix_hash = Hash.new - (0..@data_matrix_scaled.size2-1).each { |i| - column_view = @data_matrix_scaled.col(i) - data_matrix_hash[i] = column_view.to_scale - } - dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9 - cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash) - pca=Statsample::Factor::PCA.new(cor_matrix) - pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? } - @eigenvalue_sums = Array.new - (0..dataset_hash.fields.size-1).each { |i| - @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev } - } - eigenvectors_selected = Array.new - pca.eigenvectors.each_with_index { |ev, i| - if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0) - eigenvectors_selected << ev.to_a - end - } - @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose - dataset_matrix = dataset_hash.to_gsl.transpose - @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - end - - # Restores data in the original feature space (possibly with compression loss). - # @return [GSL::Matrix] Data matrix. - def restore - begin - data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca - # reverse scaling - (0..data_matrix_restored.size2-1).each { |i| - data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0 - data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i] - } - data_matrix_restored - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - end - end - - end - - end - - # DV: Removed Gauss kernel in this file since the additional calculations has no benefit for the result. - # See experiment summary: http://goo.gl/uwiKt - # Gauss kernel - # @return [Float] - #def self.gauss(x, sigma = 0.3) - # d = 1.0 - x.to_f - # Math.exp(-(d*d)/(2*sigma*sigma)) - #end - - # For symbolic features - # @param [Array] Array to test, must indicate non-occurrence with 0. - # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere. - def self.isnull_or_singular?(array) - nr_zeroes = array.count(0) - return (nr_zeroes == array.size) || # remove non-occurring feature - (nr_zeroes == array.size-1) || # remove singular feature - (nr_zeroes == 0) # also remove feature present everywhere - end - - # Numeric value test - # @param[Object] value - # @return [Boolean] Whether value is a number - def self.numeric?(value) - true if Float(value) rescue false - end - - # For symbolic features - # @param [Array] Array to test, must indicate non-occurrence with 0. - # @return [Boolean] Whether the feature has variance zero. - def self.zero_variance?(array) - return (array.to_scale.variance_population == 0.0) - end - - # Sum of an array for Arrays. - # @param [Array] Array with values - # @return [Integer] Sum of size of values - def self.sum_size(array) - sum=0 - array.each { |e| sum += e.size } - return sum - end - - # Minimum Frequency - # @param [Integer] per-mil value - # return [Integer] min-frequency - def self.min_frequency(training_dataset,per_mil) - minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST - minfreq = 2 unless minfreq > 2 - Integer (minfreq) - end - - # Effect calculation for classification - # @param [Array] Array of occurrences per class in the form of Enumerables. - # @param [Array] Array of database instance counts per class. - def self.effect(occurrences, db_instances) - max=0 - max_value=0 - nr_o = self.sum_size(occurrences) - nr_db = db_instances.to_scale.sum - - occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity. - actual = o.size.to_f/nr_o - expected = db_instances[i].to_f/nr_db - if actual > expected - if ((actual - expected) / actual) > max_value - max_value = (actual - expected) / actual # 'Schleppzeiger' - max = i - end - end - } - max - end - - # Returns Support value of an fingerprint - # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required - # return [Numeric] Support value - def self.p_sum_support(params) - p_sum = 0.0 - params[:features].each{|f| - compound_hits = params[:compound_features_hits][f] - neighbor_hits = params[:training_compound_features_hits][f] - p_sum += eval("(params[:weights][f] * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))") - } - p_sum - end - end end - - diff --git a/lib/compound.rb b/lib/compound.rb index e7b4da0..a2b7e48 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -6,13 +6,15 @@ module OpenTox # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure). class Compound + include OpenTox + attr_accessor :inchi, :uri # Create compound with optional uri # @example - # compound = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"") + # compound = Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"") # @param [optional, String] uri Compound URI - # @return [OpenTox::Compound] Compound + # @return [Compound] Compound def initialize(uri=nil) @uri = uri case @uri @@ -36,9 +38,9 @@ module OpenTox # Create a compound from smiles string # @example - # compound = OpenTox::Compound.from_smiles("c1ccccc1") + # compound = Compound.from_smiles("c1ccccc1") # @param [String] smiles Smiles string - # @return [OpenTox::Compound] Compound + # @return [Compound] Compound def self.from_smiles(smiles) c = Compound.new c.inchi = Compound.smiles2inchi(smiles) @@ -48,7 +50,7 @@ module OpenTox # Create a compound from inchi string # @param [String] smiles InChI string - # @return [OpenTox::Compound] Compound + # @return [Compound] Compound def self.from_inchi(inchi) c = Compound.new c.inchi = inchi @@ -58,7 +60,7 @@ module OpenTox # Create a compound from sdf string # @param [String] smiles SDF string - # @return [OpenTox::Compound] Compound + # @return [Compound] Compound def self.from_sdf(sdf) c = Compound.new c.inchi = Compound.sdf2inchi(sdf) @@ -68,9 +70,9 @@ module OpenTox # Create a compound from name. Relies on an external service for name lookups. # @example - # compound = OpenTox::Compound.from_name("Benzene") + # compound = Compound.from_name("Benzene") # @param [String] name name can be also an InChI/InChiKey, CAS number, etc - # @return [OpenTox::Compound] Compound + # @return [Compound] Compound def self.from_name(name) c = Compound.new # paranoid URI encoding to keep SMILES charges and brackets @@ -131,7 +133,7 @@ module OpenTox # Match a smarts string # @example - # compound = OpenTox::Compound.from_name("Benzene") + # compound = Compound.from_name("Benzene") # compound.match?("cN") # returns false # @param [String] smarts Smarts string def match?(smarts) @@ -146,7 +148,7 @@ module OpenTox # Match an array of smarts strings, returns array with matching smarts # @example - # compound = OpenTox::Compound.from_name("Benzene") + # compound = Compound.from_name("Benzene") # compound.match(['cc','cN']) # returns ['cc'] # @param [Array] smarts_array Array with Smarts strings # @return [Array] Array with matching Smarts strings @@ -166,7 +168,7 @@ module OpenTox # Match_hits an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value # @example - # compound = OpenTox::Compound.from_name("Benzene") + # compound = Compound.from_name("Benzene") # compound.match(['cc','cN']) # returns ['cc'] # @param [Array] smarts_array Array with Smarts strings # @return [Hash] Hash with matching smarts as key and number of non-unique hits as value @@ -191,6 +193,43 @@ module OpenTox return smarts_hits #smarts_array.collect { |s| s if match?(s)}.compact end + + # Lookup numerical values, returns hash with feature name as key and value as value + # @param [Array] Array of feature names + # @param [String] Feature dataset uri + # @return [Hash] Hash with feature name as key and value as value + def lookup(feature_array,feature_dataset_uri,pc_type) + ds = OpenTox::Dataset.find(feature_dataset_uri) + + #entry = ds.data_entries[self.uri] + entry = nil + ds.data_entries.each { |c_uri, values| + if c_uri.split('/compound/').last == self.to_inchi + entry = ds.data_entries[self.uri] + break + end + } + + LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil? + + if entry.nil? + uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type}) + uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi) + ds = OpenTox::Dataset.find(uri) + entry = ds.data_entries[self.uri] + ds.delete + end + + features = entry.keys + features.each { |feature| + new_feature = File.join(feature_dataset_uri, "feature", feature.split("/").last) + entry[new_feature] = entry[feature].flatten.first.to_f # see algorithm/lazar.rb:182, to_f because feature type detection doesn't work w 1 entry + entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit + } + #res = feature_array.collect {|v| entry[v]} + #LOGGER.debug "----- am #{entry.to_yaml}" + entry + end # Get URI of compound image with highlighted fragments diff --git a/lib/dataset.rb b/lib/dataset.rb index 3662527..95c1918 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -288,7 +288,7 @@ module OpenTox # Insert a statement (compound_uri,feature_uri,value) # @example Insert a statement (compound_uri,feature_uri,value) - # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true + # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", 1 # @param [String] compound Compound URI # @param [String] feature Compound URI # @param [Boolean,Float] value Feature value @@ -315,6 +315,16 @@ module OpenTox @features[feature] = metadata end + # Complete feature values by adding zeroes + def complete_data_entries + all_features = @features.keys + @data_entries.each { |c, e| + (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f| + self.add(c,f,0) + } + } + end + # Add/modify metadata for a feature # @param [String] feature Feature URI # @param [Hash] metadata Hash with feature metadata diff --git a/lib/environment.rb b/lib/environment.rb index 6a72ba5..c1b8312 100644 --- a/lib/environment.rb +++ b/lib/environment.rb @@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/' OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#' OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#' XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#' -BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#' +#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#' diff --git a/lib/model.rb b/lib/model.rb index 0b116c2..b3de1a3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -102,8 +102,8 @@ module OpenTox include Algorithm include Model - attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev, :prediction_min_max + attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors def initialize(uri=nil) if uri @@ -120,18 +120,11 @@ module OpenTox @p_values = {} @fingerprints = {} @value_map = {} - @prediction_min_max = [] @feature_calculation_algorithm = "Substructure.match" @similarity_algorithm = "Similarity.tanimoto" @prediction_algorithm = "Neighbors.weighted_majority_vote" - @nr_hits = false - @min_sim = 0.3 - @prop_kernel = false - @transform = { "class" => "NOP" } - @conf_stdev = false - end # Get URIs of all lazar models @@ -174,19 +167,14 @@ module OpenTox lazar.feature_calculation_algorithm = hash["feature_calculation_algorithm"] if hash["feature_calculation_algorithm"] lazar.similarity_algorithm = hash["similarity_algorithm"] if hash["similarity_algorithm"] lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"] - lazar.min_sim = hash["min_sim"] if hash["min_sim"] lazar.subjectid = hash["subjectid"] if hash["subjectid"] - lazar.prop_kernel = hash["prop_kernel"] if hash["prop_kernel"] lazar.value_map = hash["value_map"] if hash["value_map"] - lazar.nr_hits = hash["nr_hits"] if hash["nr_hits"] - lazar.transform = hash["transform"] if hash["transform"] - lazar.conf_stdev = hash["conf_stdev"] if hash["conf_stdev"] - lazar.prediction_min_max = hash["prediction_min_max"] if hash["prediction_min_max"] + lazar end def to_json - Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :min_sim => @min_sim, :subjectid => @subjectid, :prop_kernel => @prop_kernel, :value_map => @value_map, :nr_hits => @nr_hits, :transform => @transform, :conf_stdev => @conf_stdev, :prediction_min_max => @prediction_min_max}) + Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map}) end def run( params, accept_header=nil, waiting_task=nil ) @@ -230,8 +218,11 @@ module OpenTox predict(compound_uri,false,subjectid) count += 1 waiting_task.progress( count/d.compounds.size.to_f*100.0 ) if waiting_task - rescue => ex - LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+ex.message+" subjectid: #{subjectid}" + rescue => e + LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+e.message+" subjectid: #{subjectid}" + #LOGGER.debug "#{e.class}: #{e.message}" + #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end end #@prediction_dataset.save(subjectid) @@ -261,25 +252,41 @@ module OpenTox if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression" all_activities = [] all_activities = @activities.values.flatten.collect! { |i| i.to_f } - @prediction_min_max[0] = (all_activities.to_scale.min/2) - @prediction_min_max[1] = (all_activities.to_scale.max*2) end unless database_activity(subjectid) # adds database activity to @prediction_dataset - neighbors - prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors, - :compound => @compound, - :features => @features, - :p_values => @p_values, - :fingerprints => @fingerprints, - :similarity_algorithm => @similarity_algorithm, - :prop_kernel => @prop_kernel, + # Calculation of needed values for query compound + @compound_features = eval("#{@feature_calculation_algorithm}({ + :compound => @compound, + :features => @features, + :feature_dataset_uri => @metadata[OT.featureDataset], + :pc_type => self.parameter(\"pc_type\") + })") + + # Adding fingerprint of query compound with features and values(p_value*nr_hits) + @compound_fingerprints = {} + @compound_features.each do |feature, value| # value is nil if "Substructure.match" + if @feature_calculation_algorithm == "Substructure.match_hits" + @compound_fingerprints[feature] = @p_values[feature] * value + elsif @feature_calculation_algorithm == "Substructure.match" + @compound_fingerprints[feature] = @p_values[feature] + elsif @feature_calculation_algorithm == "Substructure.lookup" + @compound_fingerprints[feature] = value + end + end + + # Transform model data to machine learning scheme (tables of data) + mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) + mtf.transform + + # Make a prediction + prediction = eval("#{@prediction_algorithm}( { :props => mtf.props, + :acts => mtf.acts, + :sims => mtf.sims, :value_map => @value_map, - :nr_hits => @nr_hits, - :conf_stdev => @conf_stdev, - :prediction_min_max => @prediction_min_max, - :transform => @transform } ) ") + :min_train_performance => self.parameter(\"min_train_performance\") + } ) ") value_feature_uri = File.join( @uri, "predicted", "value") confidence_feature_uri = File.join( @uri, "predicted", "confidence") @@ -355,44 +362,6 @@ module OpenTox @prediction_dataset end - - - # Find neighbors and store them as object variable, access all compounds for that. - def neighbors - @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm - @neighbors = [] - @fingerprints.keys.each do |training_compound| # AM: access all compounds - add_neighbor @fingerprints[training_compound].keys, training_compound - end - end - - # Adds a neighbor to @neighbors if it passes the similarity threshold. - def add_neighbor(training_features, training_compound) - compound_features_hits = {} - training_compound_features_hits = {} - if @nr_hits - compound_features_hits = @compound.match_hits(@compound_features) - training_compound_features_hits = @fingerprints[training_compound] - #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class} #{training_compound_features_hits}" - end - params = {} - params[:nr_hits] = @nr_hits - params[:compound_features_hits] = compound_features_hits - params[:training_compound_features_hits] = training_compound_features_hits - - sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)") - if sim > @min_sim - @activities[training_compound].each do |act| - @neighbors << { - :compound => training_compound, - :similarity => sim, - :features => training_features, - :activity => act - } - end - end - end - # Find database activities and store them in @prediction_dataset # @return [Boolean] true if compound has databasse activities, false if not def database_activity(subjectid) diff --git a/lib/parser.rb b/lib/parser.rb index 749a41d..56e4fed 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -290,10 +290,11 @@ module OpenTox @features = [] @feature_types = {} - @format_errors = "" - @smiles_errors = [] + @format_errors = [] + @id_errors = [] @activity_errors = [] @duplicates = {} + @max_class_values = 3 end def detect_new_values(row, value_maps) @@ -309,9 +310,10 @@ module OpenTox # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help) # @param [Excel] book Excel workbook object (created with roo gem) # @return [OpenTox::Dataset] Dataset object with Excel data - def load_spreadsheet(book) + def load_spreadsheet(book, drop_missing=false) book.default_sheet = 0 - add_features book.row(1) + headers = book.row(1) + add_features headers value_maps = Array.new regression_features=Array.new @@ -319,15 +321,27 @@ module OpenTox row = book.row(i) value_maps = detect_new_values(row, value_maps) value_maps.each_with_index { |vm,j| - if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer. + if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer. regression_features[j]=true else regression_features[j]=false end } } + 2.upto(book.last_row) { |i| - add_values book.row(i), regression_features + drop=false + row = book.row(i) + raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size + if row.include?("") + @format_errors << "Row #{i} has #{row.count("")} missing values" + drop=true + drop_missing=true if (row.count("") == row.size-1) + end + add_values(row, regression_features) unless (drop_missing && drop) + if (drop_missing && drop) + @format_errors << "Row #{i} not added" + end } warnings @dataset @@ -336,10 +350,11 @@ module OpenTox # Load CSV string (format specification: http://toxcreate.org/help) # @param [String] csv CSV representation of the dataset # @return [OpenTox::Dataset] Dataset object with CSV data - def load_csv(csv) + def load_csv(csv, drop_missing=false) row = 0 input = csv.split("\n") - add_features split_row(input.shift) + headers = split_row(input.shift) + add_features(headers) value_maps = Array.new regression_features=Array.new @@ -347,15 +362,27 @@ module OpenTox row = split_row(row) value_maps = detect_new_values(row, value_maps) value_maps.each_with_index { |vm,j| - if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer. + if vm.size > @max_class_values # max @max_class_values classes. regression_features[j]=true else regression_features[j]=false end } } - input.each { |row| - add_values split_row(row), regression_features + + input.each_with_index { |row, i| + drop=false + row = split_row(row) + raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size + if row.include?("") + @format_errors << "Row #{i} has #{row.count("")} missing values" + drop=true + drop_missing=true if (row.count("") == row.size-1) + end + add_values(row, regression_features) unless (drop_missing && drop) + if (drop_missing && drop) + @format_errors << "Row #{i} not added" + end } warnings @dataset @@ -367,88 +394,115 @@ module OpenTox info = '' @feature_types.each do |feature,types| - if types.uniq.size > 1 + if types.uniq.size == 0 + type = "helper#MissingFeature" + elsif types.uniq.size > 1 type = OT.NumericFeature else type = types.first end @dataset.add_feature_metadata(feature,{RDF.type => [type]}) - info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." + info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type # TODO: rewrite feature values - # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored." + # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored." end @dataset.metadata[OT.Info] = info warnings = '' - warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty? + warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty? warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty? + warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty? duplicate_warnings = '' @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 } - warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty? + warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty? @dataset.metadata[OT.Warnings] = warnings end + # Adds a row of features to a dataset + # @param Array A row split up as an array + # @return Array Indices for duplicate features def add_features(row) - row.shift # get rid of smiles entry - row.each do |feature_name| + row=row.collect + row.shift # get rid of id entry + @duplicate_feature_indices = [] # starts with 0 at first f after id + row.each_with_index do |feature_name, idx| feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name)) - @feature_types[feature_uri] = [] - @features << feature_uri - @dataset.add_feature(feature_uri,{DC.title => feature_name}) + unless @features.include? feature_uri + @feature_types[feature_uri] = [] + @features << feature_uri + @dataset.add_feature(feature_uri,{DC.title => feature_name}) + else + @duplicate_feature_indices << idx + @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}" + end end end # Adds a row to a dataset # @param Array A row split up as an array # @param Array Indicator for regression for each field + # @param Array Indices for duplicate features def add_values(row, regression_features) - smiles = row.shift - compound = Compound.from_smiles(smiles) + id = row.shift + case id + when /InChI/ + compound = Compound.from_inchi(URI.decode_www_form_component(id)) + else + compound = Compound.from_smiles(id) + end + if compound.nil? or compound.inchi.nil? or compound.inchi == "" - @smiles_errors << smiles+", "+row.join(", ") + @id_errors << id+", "+row.join(", ") return false end @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi] - @duplicates[compound.inchi] << smiles+", "+row.join(", ") + @duplicates[compound.inchi] << id+", "+row.join(", ") + feature_idx = 0 row.each_index do |i| - value = row[i] - feature = @features[i] - type = nil - if (regression_features[i]) - type = feature_type(value) - if type != OT.NumericFeature - raise "Error! Expected numeric values." + unless @duplicate_feature_indices.include? i + + value = row[i] + #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty + feature = @features[feature_idx] + + type = feature_type(value) # May be NIL + type = OT.NominalFeature unless (type.nil? || regression_features[i]) + @feature_types[feature] << type if type + + val = nil + case type + when OT.NumericFeature + val = value.to_f + when OT.NominalFeature + val = value.to_s end - else - type = OT.NominalFeature - end - @feature_types[feature] << type - case type - when OT.NumericFeature - val = value.to_f - when OT.NominalFeature - val = value.to_s - end - if val!=nil - @dataset.add(compound.uri, feature, val) - if type!=OT.NumericFeature - @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue] - @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s) + feature_idx += 1 + + if val != nil + @dataset.add(compound.uri, feature, val) + if type != OT.NumericFeature + @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue] + @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s) + end end + end + end end def feature_type(value) - if OpenTox::Algorithm::numeric? value + if value == "" + return nil + elsif OpenTox::Algorithm::numeric? value return OT.NumericFeature else return OT.NominalFeature @@ -456,7 +510,7 @@ module OpenTox end def split_row(row) - row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes + row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells end end @@ -468,6 +522,7 @@ module OpenTox def initialize @data = {} @activity_errors = [] + @max_class_values = 3 end def feature_values(feature) @@ -485,14 +540,14 @@ module OpenTox def clean_features ignored_features = [] features.each do |feature| - if feature_values(feature).size > 5 + if feature_values(feature).size > @max_class_values if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature # REGRESSION elsif feature_types(feature).include? OT.NumericFeature @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)." else - @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)." + @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)." ignored_features << feature next end @@ -543,12 +598,15 @@ module OpenTox private def feature_type(value) - if OpenTox::Algorithm::numeric? value + if value.nil? + return nil + elsif OpenTox::Algorithm::numeric? value return OT.NumericFeature else return OT.NominalFeature end end + end # quick hack to enable sdf import via csv @@ -589,20 +647,20 @@ module OpenTox @duplicates[inchi] << rec #inchi#+", "+row.join(", ") compound = Compound.from_inchi inchi rescue - @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}" + @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}" next end row = {} obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) } table.data[compound.uri] = row end - - # finda and remove ignored_features + + # find and remove ignored_features @activity_errors = table.clean_features table.add_to_dataset @dataset warnings = '' - warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty? + warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty? warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty? duplicate_warnings = '' @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 } diff --git a/lib/rest_client_wrapper.rb b/lib/rest_client_wrapper.rb index 6d25bb3..fcadebb 100644 --- a/lib/rest_client_wrapper.rb +++ b/lib/rest_client_wrapper.rb @@ -70,7 +70,7 @@ module OpenTox begin #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect - resource = RestClient::Resource.new(uri,{:timeout => 60}) + resource = RestClient::Resource.new(uri,{:timeout => 600}) if rest_call=="post" || rest_call=="put" result = resource.send(rest_call, payload, headers) else diff --git a/lib/serializer.rb b/lib/serializer.rb index a010ce5..30cb2ba 100644 --- a/lib/serializer.rb +++ b/lib/serializer.rb @@ -55,7 +55,7 @@ module OpenTox OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } , OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } , - #object props for validation# + #object props for validation# OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } , OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } , OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } , @@ -87,7 +87,7 @@ module OpenTox OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } , OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } , - # annotation props for validation + # annotation props for validation OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } , OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } , OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } , @@ -143,8 +143,8 @@ module OpenTox @data_entries = {} @values_id = 0 @parameter_id = 0 - - @classes = Set.new + + @classes = Set.new @object_properties = Set.new @annotation_properties = Set.new @datatype_properties = Set.new @@ -208,7 +208,7 @@ module OpenTox @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] } add_metadata uri, metadata end - + # Add a resource defined by resource_class and content # (see documentation of add_content for example) # @param [String] uri of resource @@ -223,10 +223,10 @@ module OpenTox def add_uri(uri,type) @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] } end - + private @@content_id = 1 - + #Recursiv function to add content #@example # { DC.description => "bla", @@ -244,7 +244,7 @@ module OpenTox hash.each do |u,v| if v.is_a? Hash # value is again a hash, i.e. a new owl class is added - # first make sure type (==class) is set + # first make sure type (==class) is set type = v[RDF.type] raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type) @@ -256,7 +256,7 @@ module OpenTox # add content to new class add_content(genid,v) elsif v.is_a? Array - # value is an array, i.e. a list of values with property is added + # value is an array, i.e. a list of values with property is added v.each{ |vv| add_content( uri, { u => vv } ) } else # v.is_a? String # simple string value @@ -268,7 +268,7 @@ module OpenTox end end end - + public # Add metadata @@ -329,7 +329,7 @@ module OpenTox v = [{ "type" => "uri", "value" => value}] when "literal" v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }] - else + else raise "Illegal type #{type(value)} for #{value}." end @object[values] = { @@ -342,7 +342,7 @@ module OpenTox end # Serializers - + # Convert to N-Triples # @return [text/plain] Object OWL-DL in N-Triples format def to_ntriples @@ -353,7 +353,7 @@ module OpenTox entry.each do |p,objects| p = url(p) objects.each do |o| - case o["type"] + case o["type"] when "uri" o = url(o["value"]) when "literal" @@ -371,9 +371,15 @@ module OpenTox # Convert to RDF/XML # @return [text/plain] Object OWL-DL in RDF/XML format def to_rdfxml - Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path} + tmpf = Tempfile.open("owl-serializer") + tmpf.write(self.to_ntriples) + tmpf.flush + @path = tmpf.path # TODO: add base uri for ist services - `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null` + res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null` + tmpf.close + tmpf.delete + res end # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification @@ -427,20 +433,20 @@ module OpenTox end def literal(value,type) - # concat and << are faster string concatination operators than + + # concat and << are faster string concatination operators than + '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>') end def url(uri) - # concat and << are faster string concatination operators than + + # concat and << are faster string concatination operators than + '<'.concat(uri).concat('>') end def rdf_types - @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } } - @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } } - @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } } - @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } } + @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } } + @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } } + @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } } + @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } } end end @@ -457,27 +463,38 @@ module OpenTox @rows.first << features @rows.first.flatten! dataset.data_entries.each do |compound,entries| - smiles = Compound.new(compound).to_smiles + cmpd = Compound.new(compound) + smiles = cmpd.to_smiles + inchi = URI.encode_www_form_component(cmpd.to_inchi) + row_container = Array.new row = Array.new(@rows.first.size) - row[0] = smiles + row_container << row + #row[0] = smiles + row[0] = inchi entries.each do |feature, values| i = features.index(feature)+1 values.each do |value| - if row[i] - row[i] = "#{row[i]} #{value}" # multiple values + if row_container[0][i] + #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'" + row_container << row_container.last.collect + row_container.last[i] = value + #LOGGER.debug "RC: #{row_container.to_yaml}" else - row[i] = value + row_container.each { |r| r[i] = value } end end end - @rows << row + row_container.each { |r| @rows << r } end end # Convert to CSV string # @return [String] CSV string def to_csv - @rows.collect{|r| r.join(", ")}.join("\n") + rows = @rows.collect + result = "" + result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name + result << rows.collect{ |r| r.join(",") }.join("\n") end # Convert to spreadsheet workbook diff --git a/lib/transform.rb b/lib/transform.rb new file mode 100644 index 0000000..8fe1093 --- /dev/null +++ b/lib/transform.rb @@ -0,0 +1,520 @@ +module OpenTox + module Transform + # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos + + # LogAutoScaler for GSL vectors. + # Take log and scale. + class LogAutoScale + attr_accessor :vs, :offset, :autoscaler + + # @param [GSL::Vector] Values to transform using LogAutoScaling. + def initialize values + @distance_to_zero = 1.0 + begin + raise "Cannot transform, values empty." if values.size==0 + vs = values.clone + @offset = vs.min - @distance_to_zero + @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs) + @vs = @autoscaler.vs + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # @param [GSL::Vector] values to restore. + # @return [GSL::Vector] transformed values. + def restore values + begin + raise "Cannot transform, values empty." if values.size==0 + vs = values.clone + rv = @autoscaler.restore(vs) + rv.to_a.collect { |v| (10**v) + @offset }.to_gv + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # @param [GSL::Vector] values to transform. + # @return [GSL::Vector] transformed values. + def mvlog values + values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv + end + + end + + + # Auto-Scaler for GSL vectors. + # Center on mean and divide by standard deviation. + class AutoScale + attr_accessor :vs, :mean, :stdev + + # @param [GSL::Vector] values to transform using AutoScaling. + def initialize values + begin + raise "Cannot transform, values empty." if values.size==0 + vs = values.clone + @mean = vs.to_scale.mean + @stdev = vs.to_scale.standard_deviation_population + @stdev = 0.0 if @stdev.nan? + @vs = transform vs + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # @param [GSL::Vector] values to transform. + # @return [GSL::Vector] transformed values. + def transform values + begin + raise "Cannot transform, values empty." if values.size==0 + autoscale values.clone + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # @param [GSL::Vector] Values to restore. + # @return [GSL::Vector] transformed values. + def restore values + begin + raise "Cannot transform, values empty." if values.size==0 + rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0 + (rv_ss + @mean).to_gsl + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # @param [GSL::Vector] values to transform. + # @return [GSL::Vector] transformed values. + def autoscale values + vs_ss = values.clone.to_scale - @mean + @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl + end + + end + + + # Principal Components Analysis. + class PCA + attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler + + # Creates a transformed dataset as GSL::Matrix. + # + # @param [GSL::Matrix] Data matrix. + # @param [Float] Compression ratio from [0,1], default 0.05. + # @return [GSL::Matrix] Data transformed matrix. + def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0) + begin + @data_matrix = data_matrix.clone + @compression = compression.to_f + @mean = Array.new + @autoscaler = Array.new + @cols = Array.new + @maxcols = maxcols + + # Objective Feature Selection + raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2 + @data_matrix_selected = nil + (0..@data_matrix.size2-1).each { |i| + if !Algorithm::zero_variance?(@data_matrix.col(i).to_a) + if @data_matrix_selected.nil? + @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1) + @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i) + else + @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1)) + end + @cols << i + end + } + raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2) + + # PCA uses internal centering on 0 + @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size) + (0..@cols.size-1).each { |i| + as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i)) + @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev + @mean << as.mean + @autoscaler << as + } + + # PCA + data_matrix_hash = Hash.new + (0..@cols.size-1).each { |i| + column_view = @data_matrix_scaled.col(i) + data_matrix_hash[i] = column_view.to_scale + } + dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9 + cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash) + pca=Statsample::Factor::PCA.new(cor_matrix) + + # Select best eigenvectors + pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? } + @eigenvalue_sums = Array.new + (0..@cols.size-1).each { |i| + @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev } + } + eigenvectors_selected = Array.new + pca.eigenvectors.each_with_index { |ev, i| + if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0) + eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size + end + } + @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose + @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose + + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # Transforms data to feature space found by PCA. + # + # @param [GSL::Matrix] Data matrix. + # @return [GSL::Matrix] Transformed data matrix. + def transform values + begin + vs = values.clone + raise "Error! Too few columns for transformation." if vs.size2 < @cols.max + data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size) + @cols.each_with_index { |i,j| + data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev + } + (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # Restores data in the original feature space (possibly with compression loss). + # + # @param [GSL::Matrix] Transformed data matrix. + # @return [GSL::Matrix] Data matrix. + def restore + begin + data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca + # reverse scaling + (0..@cols.size-1).each { |i| + data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i] + } + data_matrix_restored + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + end + + + # Singular Value Decomposition + class SVD + attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv + + # Creates a transformed dataset as GSL::Matrix. + # + # @param [GSL::Matrix] Data matrix + # @param [Float] Compression ratio from [0,1], default 0.05 + # @return [GSL::Matrix] Data transformed matrix + + def initialize data_matrix, compression=0.05 + begin + @data_matrix = data_matrix.clone + @compression = compression + + # Compute the SV Decomposition X=USV + # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)! + u, vt, s = data_matrix.SV_decomp + + # Determine cutoff index + s2 = s.mul(s) ; s2_sum = s2.sum + s2_run = 0 + k = s2.size - 1 + s2.to_a.reverse.each { |v| + s2_run += v + frac = s2_run / s2_sum + break if frac > compression + k -= 1 + } + k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1) + + # Take the k-rank approximation of the Matrix + # - Take first k columns of u + # - Take first k columns of vt + # - Take the first k eigenvalues + @uk = u.submatrix(nil, (0..k)) # used to transform column format data + @vk = vt.submatrix(nil, (0..k)) # used to transform row format data + s = GSL::Matrix.diagonal(s) + @eigk = s.submatrix((0..k), (0..k)) + @eigk_inv = @eigk.inv + + # Transform data + @data_transformed_matrix = @uk # = u for all SVs + # NOTE: @data_transformed_matrix is also equal to + # @data_matrix * @vk * @eigk_inv + + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + + # Transforms data instance (1 row) to feature space found by SVD. + # + # @param [GSL::Matrix] Data matrix (1 x m). + # @return [GSL::Matrix] Transformed data matrix. + def transform_instance values + begin + values * @vk * @eigk_inv + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + alias :transform :transform_instance # make this the default (see PCA interface) + + # Transforms data feature (1 column) to feature space found by SVD. + # + # @param [GSL::Matrix] Data matrix (1 x n). + # @return [GSL::Matrix] Transformed data matrix. + def transform_feature values + begin + values * @uk * @eigk_inv + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + + # Restores data in the original feature space (possibly with compression loss). + # + # @param [GSL::Matrix] Transformed data matrix. + # @return [GSL::Matrix] Data matrix. + def restore + begin + @data_transformed_matrix * @eigk * @vk.transpose # reverse svd + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + + end + + + + # Attaches transformations to an OpenTox::Model + # Stores props, sims, performs similarity calculations + class ModelTransformer + attr_accessor :model, :similarity_algorithm, :acts, :sims + + # @params[OpenTox::Model] model to transform + def initialize model + @model = model + @similarity_algorithm = @model.similarity_algorithm + end + + def transform + get_matrices # creates @n_prop, @q_prop, @acts from ordered fps + @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors + + # Preprocessing + if (@model.similarity_algorithm == "Similarity.cosine") + # truncate nil-columns and -rows + LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + while @q_prop.size>0 + idx = @q_prop.index(nil) + break if idx.nil? + @q_prop.slice!(idx) + @n_prop.each { |r| r.slice!(idx) } + end + LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors + LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + + # adjust rest + fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp + cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp + acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp + + # scale and svd + nr_cases, nr_features = @n_prop.size, @n_prop[0].size + gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup + gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup + (0...nr_features).each { |i| + autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i)) + gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs + gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i) + } + svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0) + @n_prop = svd.data_transformed_matrix.to_a + @q_prop = svd.transform(gsl_q_prop).row(0).to_a + LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + else + convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched + end + + # neighbor calculation + @ids = [] # surviving compounds become neighbors + @sims = [] # calculated by neighbor routine + neighbors + n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix + acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp + + + # Sims between neighbors, if necessary + gram_matrix = [] + if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.) + @n_prop.each_index do |i| + gram_matrix[i] = [] unless gram_matrix[i] + @n_prop.each_index do |j| + if (j>i) + sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])") + gram_matrix[i][j] = sim + gram_matrix[j] = [] unless gram_matrix[j] + gram_matrix[j][i] = gram_matrix[i][j] + end + end + gram_matrix[i][i] = 1.0 + end + end + + # reclaim original data (if svd was performed) + if svd + @n_prop = gsl_n_prop_orig.to_a + n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp + @q_prop = gsl_q_prop_orig.row(0).to_a + end + + LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" + + @sims = [ gram_matrix, @sims ] + + end + + + + + # Find neighbors and store them as object variable, access all compounds for that. + def neighbors + @model.neighbors = [] + @n_prop.each_with_index do |fp, idx| # AM: access all compounds + add_neighbor fp, idx + end + end + + + # Adds a neighbor to @neighbors if it passes the similarity threshold + # adjusts @ids to signal the + def add_neighbor(training_props, idx) + + sim = similarity(training_props) + if sim > @model.parameter("min_sim") + if @model.activities[@cmpds[idx]] + @model.activities[@cmpds[idx]].each do |act| + @model.neighbors << { + :compound => @cmpds[idx], + :similarity => sim, + :features => @fps[idx].keys, + :activity => act + } + @sims << sim + @ids << idx + end + end + end + end + + + # Removes nil entries from n_prop and q_prop. + # Matrix is a nested two-dimensional array. + # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed. + # Tie break: columns take precedence. + # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero. + # Enables the use of cosine similarity / SVD + def remove_nils + return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0) + col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) } + row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) } + m_cols = col_nr_nils.max + m_rows = row_nr_nils.max + idx_cols = col_nr_nils.index(m_cols) + idx_rows = row_nr_nils.index(m_rows) + while ((m_cols > 0) || (m_rows > 0)) do + if m_cols >= m_rows + @n_prop.each { |row| row.slice!(idx_cols) } + @q_prop.slice!(idx_cols) + else + @n_prop.slice!(idx_rows) + @ids.slice!(idx_rows) + end + break if (@n_prop.length == 0) || (@n_prop[0].length == 0) + col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) } + row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) } + m_cols = col_nr_nils.max + m_rows = row_nr_nils.max + idx_cols= col_nr_nils.index(m_cols) + idx_rows = row_nr_nils.index(m_rows) + end + end + + + # Replaces nils by zeroes in n_prop and q_prop + # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop) + def convert_nils + @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } } + @q_prop.collect! { |v| v.nil? ? 0 : v } + end + + + # Executes model similarity_algorithm + def similarity(training_props) + eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)") + end + + + # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed. + # Same for compound fingerprints. + def get_matrices + + @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = [] + + @model.fingerprints.each { |fp| + cmpd = fp[0]; fp = fp[1] + if @model.activities[cmpd] # row good + acts = @model.activities[cmpd]; @acts += acts + LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1 + row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's + acts.size.times { # multiple additions for multiple activities + @n_prop << row.collect + @cmpds << cmpd + @fps << Marshal.load(Marshal.dump(fp)) + } + else + LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'" + end + } + + @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure + + end + + def props + @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil + end + + end + + end +end diff --git a/lib/utils.rb b/lib/utils.rb new file mode 100644 index 0000000..c9e9eca --- /dev/null +++ b/lib/utils.rb @@ -0,0 +1,372 @@ +require 'csv' + + +module OpenTox + + module Algorithm + + include OpenTox + + # Calculate physico-chemical descriptors. + # @param[Hash] Required keys: :dataset_uri, :pc_type + # @return[String] dataset uri + + def self.pc_descriptors(params) + + begin + ds = OpenTox::Dataset.find(params[:dataset_uri]) + compounds = ds.compounds.collect + ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } ) + #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing + LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" + load_ds_csv(ambit_result_uri, smiles_to_inchi) + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + + end + + # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit. + # @param[Hash] Required keys: :compounds, :pc_type + # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features + def self.get_pc_descriptors(params) + + begin + + ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/" + ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632" + descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") ) + descs_uris = [] + params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type + types = params[:pc_type].split(",") + descs.each { |uri, cat_name| + if types.include? cat_name[:category] + descs_uris << uri + end + } + if descs_uris.size == 0 + raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?" + end + #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}" + + begin + # Create SMI + smiles_array = []; smiles_to_inchi = {} + params[:compounds].each do |n| + cmpd = OpenTox::Compound.new(n) + smiles_string = cmpd.to_smiles + smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi) + smiles_array << smiles_string + end + smi_file = Tempfile.open(['pc_ambit', '.csv']) + pc_descriptors = nil + + # Create Ambit dataset + smi_file.puts( "SMILES\n" ) + smi_file.puts( smiles_array.join("\n") ) + smi_file.flush + ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} ) + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + smi_file.close! if smi_file + end + ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp + + # Calculate 3D for CPSA + if types.include? "cpsa" + ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} ) + LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }" + end + + # Get Ambit results + ambit_result_uri = [] # 1st pos: base uri, then features + ambit_result_uri << ambit_ds_uri + "?" + ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&") + descs_uris.each_with_index do |uri, i| + algorithm = Algorithm::Generic.new(uri) + result_uri = algorithm.run({:dataset_uri => ambit_ds_uri}) + ambit_result_uri << result_uri.split("?")[1] + "&" + LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}" + end + #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}" + [ ambit_result_uri, smiles_to_inchi ] + + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + + # Load dataset via CSV + # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features + # @return[String] dataset uri + def self.load_ds_csv(ambit_result_uri, smiles_to_inchi) + + master=nil + (1...ambit_result_uri.size).collect { |idx| + curr_uri = ambit_result_uri[0] + ambit_result_uri[idx] + LOGGER.debug "Requesting #{curr_uri}" + csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) ) + if csv_data[0] && csv_data[0].size>1 + if master.nil? # This is the smiles entry + (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] } + master = csv_data + next + else + index_uri = csv_data[0].index("SMILES") + csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information + + nr_cols = (csv_data[0].size)-1 + LOGGER.debug "Merging #{nr_cols} new columns" + master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows + csv_data.each do |row| + temp = master.assoc(row[0]) # Finds the appropriate line in master + ((-1*nr_cols)..-1).collect.each { |idx| + temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found + } + end + end + end + } + + index_uri = master[0].index("Compound") + master.map {|i| i.delete_at(index_uri)} + master[0].each {|cell| cell.chomp!(" ")} + master[0][0] = "Compound" #"SMILES" + index_smi = master[0].index("SMILES") + master.map {|i| i.delete_at(index_smi)} if index_smi + #master[0][0] = "SMILES" + + #LOGGER.debug "-------- AM: Writing to dumpfile" + #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) } + + parser = OpenTox::Parser::Spreadsheets.new + ds = OpenTox::Dataset.new + ds.save + parser.dataset = ds + ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n")) + ds.save + end + + + # Gauss kernel + # @return [Float] + def self.gauss(x, sigma = 0.3) + d = 1.0 - x.to_f + Math.exp(-(d*d)/(2*sigma*sigma)) + end + + + # For symbolic features + # @param [Array] Array to test, must indicate non-occurrence with 0. + # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere. + def self.isnull_or_singular?(array) + nr_zeroes = array.count(0) + return (nr_zeroes == array.size) || # remove non-occurring feature + (nr_zeroes == array.size-1) || # remove singular feature + (nr_zeroes == 0) # also remove feature present everywhere + end + + + # Numeric value test + # @param[Object] value + # @return [Boolean] Whether value is a number + def self.numeric?(value) + true if Float(value) rescue false + end + + + # For symbolic features + # @param [Array] Array to test, must indicate non-occurrence with 0. + # @return [Boolean] Whether the feature has variance zero. + def self.zero_variance?(array) + return array.uniq.size == 1 + end + + + # Sum of an array for Arrays. + # @param [Array] Array with values + # @return [Integer] Sum of size of values + def self.sum_size(array) + sum=0 + array.each { |e| sum += e.size } + return sum + end + + + # Minimum Frequency + # @param [Integer] per-mil value + # return [Integer] min-frequency + def self.min_frequency(training_dataset,per_mil) + minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = 2 unless minfreq > 2 + Integer (minfreq) + end + + + # Effect calculation for classification + # @param [Array] Array of occurrences per class in the form of Enumerables. + # @param [Array] Array of database instance counts per class. + def self.effect(occurrences, db_instances) + max=0 + max_value=0 + nr_o = self.sum_size(occurrences) + nr_db = db_instances.to_scale.sum + + occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity. + actual = o.size.to_f/nr_o + expected = db_instances[i].to_f/nr_db + if actual > expected + if ((actual - expected) / actual) > max_value + max_value = (actual - expected) / actual # 'Schleppzeiger' + max = i + end + end + } + max + end + + + # neighbors + + module Neighbors + + # Get confidence. + # @param[Hash] Required keys: :sims, :acts + # @return[Float] Confidence + def self.get_confidence(params) + conf = params[:sims].inject{|sum,x| sum + x } + confidence = conf/params[:sims].size + LOGGER.debug "Confidence is: '" + confidence.to_s + "'." + return confidence + end + + end + + + # Similarity calculations + module Similarity + + # Tanimoto similarity + # @param [Hash, Array] fingerprints of first compound + # @param [Hash, Array] fingerprints of second compound + # @return [Float] (Weighted) tanimoto similarity + def self.tanimoto(fingerprints_a,fingerprints_b,weights=nil,params=nil) + + common_p_sum = 0.0 + all_p_sum = 0.0 + + # fingerprints are hashes + if fingerprints_a.class == Hash && fingerprints_b.class == Hash + common_features = fingerprints_a.keys & fingerprints_b.keys + all_features = (fingerprints_a.keys + fingerprints_b.keys).uniq + if common_features.size > 0 + common_features.each{ |f| common_p_sum += [ fingerprints_a[f], fingerprints_b[f] ].min } + all_features.each{ |f| all_p_sum += [ fingerprints_a[f],fingerprints_b[f] ].compact.max } # compact, since one fp may be empty at that pos + end + + # fingerprints are arrays + elsif fingerprints_a.class == Array && fingerprints_b.class == Array + size = [ fingerprints_a.size, fingerprints_b.size ].min + LOGGER.warn "fingerprints don't have equal size" if fingerprints_a.size != fingerprints_b.size + (0...size).each { |idx| + common_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].min + all_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].max + } + end + + (all_p_sum > 0.0) ? (common_p_sum/all_p_sum) : 0.0 + + end + + + # Cosine similarity + # @param [Hash] properties_a key-value properties of first compound + # @param [Hash] properties_b key-value properties of second compound + # @return [Float] cosine of angle enclosed between vectors induced by keys present in both a and b + def self.cosine(fingerprints_a,fingerprints_b,weights=nil) + + # fingerprints are hashes + if fingerprints_a.class == Hash && fingerprints_b.class == Hash + a = []; b = [] + common_features = fingerprints_a.keys & fingerprints_b.keys + if common_features.size > 1 + common_features.each do |p| + a << fingerprints_a[p] + b << fingerprints_b[p] + end + end + + # fingerprints are arrays + elsif fingerprints_a.class == Array && fingerprints_b.class == Array + a = fingerprints_a + b = fingerprints_b + end + + (a.size > 0 && b.size > 0) ? self.cosine_num(a.to_gv, b.to_gv) : 0.0 + + end + + + # Cosine similarity + # @param [GSL::Vector] a + # @param [GSL::Vector] b + # @return [Float] cosine of angle enclosed between a and b + def self.cosine_num(a, b) + if a.size>12 && b.size>12 + a = a[0..11] + b = b[0..11] + end + a.dot(b) / (a.norm * b.norm) + end + + + # Outlier detection based on Mahalanobis distances + # Multivariate detection on X, univariate detection on y + # Uses an existing Rinruby instance, if possible + # @param[Hash] Keys query_matrix, data_matrix, acts are required; r, p_outlier optional + # @return[Array] indices identifying outliers (may occur several times, this is intended) + def self.outliers(params) + outlier_array = [] + data_matrix = params[:data_matrix] + query_matrix = params[:query_matrix] + acts = params[:acts] + begin + LOGGER.debug "Outliers (p=#{params[:p_outlier] || 0.9999})..." + r = ( params[:r] || RinRuby.new(false,false) ) + r.eval "suppressPackageStartupMessages(library(\"robustbase\"))" + r.eval "outlier_threshold = #{params[:p_outlier] || 0.999}" + nr_cases, nr_features = data_matrix.to_a.size, data_matrix.to_a[0].size + r.odx = data_matrix.to_a.flatten + r.q = query_matrix.to_a.flatten + r.y = acts.to_a.flatten + r.eval "odx = matrix(odx, #{nr_cases}, #{nr_features}, byrow=T)" + r.eval 'odx = rbind(q,odx)' # query is nr 0 (1) in ruby (R) + r.eval 'mah = covMcd(odx)$mah' # run MCD alg + r.eval "mah = pchisq(mah,#{nr_features})" + r.eval 'outlier_array = which(mah>outlier_threshold)' # multivariate outliers using robust mahalanobis + outlier_array = r.outlier_array.to_a.collect{|v| v-2 } # translate to ruby index (-1 for q, -1 due to ruby) + r.eval 'fqu = matrix(summary(y))[2]' + r.eval 'tqu = matrix(summary(y))[5]' + r.eval 'outlier_array = which(y>(tqu+1.5*IQR(y)))' # univariate outliers due to Tukey (http://goo.gl/mwzNH) + outlier_array += r.outlier_array.to_a.collect{|v| v-1 } # translate to ruby index (-1 due to ruby) + r.eval 'outlier_array = which(y<(fqu-1.5*IQR(y)))' + outlier_array += r.outlier_array.to_a.collect{|v| v-1 } + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + outlier_array + end + + end + + + end + +end + diff --git a/opentox-ruby.gemspec b/opentox-ruby.gemspec index d316689..d3ae2d7 100644 --- a/opentox-ruby.gemspec +++ b/opentox-ruby.gemspec @@ -48,6 +48,8 @@ Gem::Specification.new do |s| "lib/templates/default_guest_policy.xml", "lib/templates/default_policy.xml", "lib/to-html.rb", + "lib/transform.rb", + "lib/utils.rb" "lib/validation.rb" ] s.homepage = %q{http://github.com/helma/opentox-ruby} @@ -88,6 +90,7 @@ Gem::Specification.new do |s| s.add_runtime_dependency(%q<dm-sqlite-adapter>, [">= 0"]) s.add_runtime_dependency(%q<haml>, [">= 3"]) s.add_runtime_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_runtime_dependency(%q<statsample>, [">= 0"]) s.add_development_dependency(%q<jeweler>, [">= 0"]) else s.add_dependency(%q<sinatra>, [">= 0"]) @@ -119,6 +122,7 @@ Gem::Specification.new do |s| s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"]) s.add_dependency(%q<haml>, [">= 3"]) s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_dependency(%q<statsample>, [">= 0"]) s.add_dependency(%q<jeweler>, [">= 0"]) end else @@ -151,6 +155,7 @@ Gem::Specification.new do |s| s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"]) s.add_dependency(%q<haml>, [">= 3"]) s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_dependency(%q<statsample>, [">= 0"]) s.add_dependency(%q<jeweler>, [">= 0"]) end end |