diff options
author | mguetlein <martin.guetlein@gmail.com> | 2011-08-03 15:26:26 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2011-08-03 15:26:26 +0200 |
commit | 3fddd473ee16757ac5ae98b5ebcdca2834439ded (patch) | |
tree | 64ecfee255ddde801d1db2610a0f177a8609d23b | |
parent | 0c21b5c58977d16c74d7e976d37d5361ffcb63d1 (diff) | |
parent | acfe33c4fd91efe5d5455892f20a3ffe20c3954c (diff) |
Merge branch 'development' of github.com:opentox/opentox-ruby into development
-rw-r--r-- | Rakefile | 4 | ||||
-rw-r--r-- | lib/algorithm.rb | 713 | ||||
-rw-r--r-- | lib/compound.rb | 29 | ||||
-rw-r--r-- | lib/dataset.rb | 31 | ||||
-rw-r--r-- | lib/model.rb | 187 | ||||
-rw-r--r-- | lib/parser.rb | 192 |
6 files changed, 846 insertions, 310 deletions
@@ -16,7 +16,7 @@ begin gem.add_dependency "sinatra-respond_to", "=0.7.0" gem.add_dependency "sinatra-static-assets", "=0.5.0" gem.add_dependency "rest-client", "=1.6.1" - gem.add_dependency "rack", "=1.3.0" + gem.add_dependency "rack", "=1.3.1" gem.add_dependency "rack-contrib", "=1.1.0" gem.add_dependency "rack-flash", "=0.1.1" gem.add_dependency "nokogiri", "=1.4.4" @@ -44,6 +44,8 @@ begin gem.add_dependency "dm-sqlite-adapter", "=1.1.0" gem.add_dependency "ruby-plot", "=0.5.0" gem.add_dependency "gsl", "=1.14.7" + gem.add_dependency "statsample", "=1.1.0" + #gem.add_dependency "statsample-optimization", "=2.1.0" gem.add_development_dependency 'jeweler' gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore'] diff --git a/lib/algorithm.rb b/lib/algorithm.rb index bfa79d3..9a5ff01 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -3,6 +3,8 @@ # avoids compiling R with X R = nil require "rinruby" +require "statsample" +require 'uri' module OpenTox @@ -50,11 +52,11 @@ module OpenTox include Algorithm attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi - def check_params(params,per_mil) + def check_params(params,per_mil,subjectid=nil) raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? - @prediction_feature = OpenTox::Feature.find params[:prediction_feature], @subjectid - @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", @subjectid + @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid + @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature]) unless params[:min_frequency].nil? @@ -80,18 +82,6 @@ module OpenTox next end - # AM: take log if appropriate - take_logs=true - entry.each do |feature,values| - values.each do |value| - if @prediction_feature.feature_type == "regression" - if (! value.nil?) && (value.to_f <= 0) - take_logs=false - end - end - end - end - value_map=params[:value_map] unless params[:value_map].nil? entry.each do |feature,values| if feature == @prediction_feature.uri @@ -103,7 +93,7 @@ module OpenTox activity= value_map.invert[value].to_i # activities are mapped to 1..n @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect elsif @prediction_feature.feature_type == "regression" - activity= take_logs ? Math.log10(value.to_f) : value.to_f + activity= value.to_f end begin fminer_instance.AddCompound(smiles,id) @@ -164,19 +154,34 @@ module OpenTox # @param [Array] features_a Features of first compound # @param [Array] features_b Features of second compound # @param [optional, Hash] weights Weights for all features + # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required # @return [Float] (Weighted) tanimoto similarity - def self.tanimoto(features_a,features_b,weights=nil) + def self.tanimoto(features_a,features_b,weights=nil,params=nil) common_features = features_a & features_b all_features = (features_a + features_b).uniq - common_p_sum = 0.0 + #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}" if common_features.size > 0 if weights - common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])} - all_p_sum = 0.0 - all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])} + #LOGGER.debug "nr_hits: #{params[:nr_hits]}" + if !params.nil? && params[:nr_hits] + params[:weights] = weights + params[:mode] = "min" + params[:features] = common_features + common_p_sum = Algorithm.p_sum_support(params) + params[:mode] = "max" + params[:features] = all_features + all_p_sum = Algorithm.p_sum_support(params) + else + common_p_sum = 0.0 + common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])} + all_p_sum = 0.0 + all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])} + end + #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}" common_p_sum/all_p_sum else - common_features.to_f/all_features + #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}" + common_features.size.to_f/all_features.size.to_f end else 0.0 @@ -206,90 +211,192 @@ module OpenTox end end - module Neighbors + # Structural Graph Clustering by TU Munich + # Finds clusters similar to a query structure in a given training dataset + # May be queried for cluster membership of an unknown compound + class StructuralClustering + attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array - # Local multi-linear regression (MLR) prediction from neighbors. - # Uses propositionalized setting. - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` - # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @return [Numeric] A prediction value. - def self.local_mlr_prop(neighbors, params, props) + # @params[String] Training dataset_uri + # @params[Float] Similarity threshold for training (optional) + # @params[String] Cluster service uri (no AA) + def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering" - take_logs=true + if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil? + raise "Invalid URI." + end + @training_dataset_uri = training_dataset_uri + if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1 + raise "Training threshold out of bounds." + end + @training_threshold = training_threshold.to_f - neighbors.each do |n| - if (! n[:activity].nil?) && (n[:activity].to_f < 0.0) - take_logs = false + # Train a cluster model + params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold } + @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params + cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri + @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model + + # Process parsed OWL objects + @clusterid_dataset_map = Hash.new + @datasets.each { |d| + begin + d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant) + @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri + rescue Exception => e + # ignore other entries! end + } + end + + # Whether a model has been trained + def trained? + !@cluster_model_uri.nil? + end + + # Instance query: clusters for a compound + # @params[String] Query compound + # @params[Float] Similarity threshold for query to clusters (optional) + def get_clusters query_compound_uri, query_threshold = 0.5 + + if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1 + raise "Query threshold out of bounds." + end + @query_threshold = query_threshold.to_f + + + # Preparing a query dataset + query_dataset = OpenTox::Dataset.new + @query_dataset_uri = query_dataset.save + query_dataset = OpenTox::Dataset.find @query_dataset_uri + query_dataset.add_compound query_compound_uri + @query_dataset_uri = query_dataset.save + + # Obtaining a clustering for query compound + params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold } + cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params + cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri + cluster_query_dataset.load_all + + # Reading cluster ids for features from metadata + feature_clusterid_map = Hash.new + pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant) + cluster_query_dataset.features.each { |feature_uri,metadata| + metadata[DC.title][pattern]="" + feature_clusterid_map[feature_uri] = metadata[DC.title].to_i + } + + # Integrity check + unless cluster_query_dataset.compounds.size == 1 + raise "Number of predicted compounds is != 1." end - acts = neighbors.collect do |n| - act = n[:activity] - take_logs ? Math.log10(act.to_f) : act.to_f - end # activities of neighbors for supervised learning + # Process data entry + query_compound_uri = cluster_query_dataset.compounds[0] + @target_clusters_array = Array.new + cluster_query_dataset.features.keys.each { |cluster_membership_feature| + + # Getting dataset URI for cluster + target_cluster = feature_clusterid_map[cluster_membership_feature] + dataset = @clusterid_dataset_map[target_cluster] + + # Finally look up presence + data_entry = cluster_query_dataset.data_entries[query_compound_uri] + present = data_entry[cluster_membership_feature][0] + + # Store result + @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence + } + end + end - begin + module Neighbors + # Local multi-linear regression (MLR) prediction from neighbors. + # Uses propositionalized setting. + # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @return [Numeric] A prediction value. + def self.local_mlr_prop(params) + + confidence=0.0 + prediction=nil + + if params[:neighbors].size>0 + props = params[:prop_kernel] ? get_props(params) : nil + acts = params[:neighbors].collect { |n| act = n[:activity].to_f } + sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) } LOGGER.debug "Local MLR (Propositionalization / GSL)." - n_prop = props[0] # is a matrix, i.e. two nested Arrays. - q_prop = props[1] # is an Array. - n_prop_x_size = n_prop[0].size - n_prop_y_size = n_prop.size + prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} ) + transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})") + prediction = transformer.values[0] + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + params[:conf_stdev] = false if params[:conf_stdev].nil? + confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) + end + {:prediction => prediction, :confidence => confidence} - n_prop.flatten! - y_x_rel = n_prop_y_size.to_f / n_prop_x_size - repeat_factor = (1/y_x_rel).ceil - n_prop_tmp = Array.new ; repeat_factor.times { n_prop_tmp.concat n_prop } ; n_prop = n_prop_tmp - acts_tmp = Array.new ; repeat_factor.times { acts_tmp.concat acts } ; acts = acts_tmp + end - if n_prop.size == 0 - raise "No neighbors found." - else - begin - LOGGER.debug "Setting GSL data ..." - # set data - prop_matrix = GSL::Matrix[n_prop, n_prop_y_size * repeat_factor, n_prop_x_size] - y = GSL::Vector[acts] - q_prop = GSL::Vector[q_prop] + # Multi-linear regression weighted by similarity. + # Objective Feature Selection, Principal Components Analysis, Scaling of Axes. + # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required + # @return [Numeric] A prediction value. + def self.mlr(params) - # model + support vectors - LOGGER.debug "Creating MLR model ..." - work = GSL::MultiFit::Workspace.alloc(n_prop_y_size * repeat_factor, n_prop_x_size) - c, cov, chisq, status = GSL::MultiFit::linear(prop_matrix, y, work) - LOGGER.debug "Predicting ..." - prediction = GSL::MultiFit::linear_est(q_prop, c, cov)[0] - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" - end - end + # GSL matrix operations: + # to_a : row-wise conversion to nested array + # + # Statsample operations (build on GSL): + # to_scale: convert into Statsample format - prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f) - LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + begin + n_prop = params[:n_prop].collect { |v| v } + q_prop = params[:q_prop].collect { |v| v } + n_prop << q_prop # attach q_prop + nr_cases, nr_features = get_sizes n_prop + data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features) + + # Principal Components Analysis + LOGGER.debug "PCA..." + pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix) + data_matrix = pca.data_transformed_matrix + + # Attach intercept column to data + intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1) + data_matrix = data_matrix.horzcat(intercept) + (0..data_matrix.size2-2).each { |i| + autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i)) + data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values + } + + # Detach query instance + n_prop = data_matrix.to_a + q_prop = n_prop.pop + nr_cases, nr_features = get_sizes n_prop + data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features) + + # model + support vectors + LOGGER.debug "Creating MLR model ..." + c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl) + GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0] rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + LOGGER.debug "#{e.class}: #{e.message}" end - sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors - conf = sims.inject{|sum,x| sum + x } - confidence = conf/neighbors.size if neighbors.size > 0 - {:prediction => prediction, :confidence => confidence} - end + end # Classification with majority vote from neighbors weighted by similarity - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity` - # @param [optional] params Ignored (only for compatibility with local_svm_regression) - # @return [Hash] Hash with keys `:prediction, :confidence` - def self.weighted_majority_vote(neighbors,params={}, props=nil) + # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @return [Numeric] A prediction value. + def self.weighted_majority_vote(params) + neighbor_contribution = 0.0 confidence_sum = 0.0 confidence = 0.0 prediction = nil - positive_map_value= nil - negative_map_value= nil - neighbors.each do |neighbor| + params[:neighbors].each do |neighbor| neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f neighbor_contribution += neighbor[:activity].to_f * neighbor_weight @@ -307,70 +414,57 @@ module OpenTox if params[:value_map].size == 2 if confidence_sum >= 0.0 - prediction = 2 unless neighbors.size==0 + prediction = 2 unless params[:neighbors].size==0 elsif confidence_sum < 0.0 - prediction = 1 unless neighbors.size==0 + prediction = 1 unless params[:neighbors].size==0 end else - prediction = (neighbor_contribution/confidence_sum).round unless neighbors.size==0 # AM: new multinomial prediction + prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction end - - confidence = confidence_sum/neighbors.size if neighbors.size > 0 + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil? + confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0 + LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil? return {:prediction => prediction, :confidence => confidence.abs} end # Local support vector regression from neighbors - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` - # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required - # @return [Hash] Hash with keys `:prediction, :confidence` - def self.local_svm_regression(neighbors, params, props=nil) - take_logs=true - neighbors.each do |n| - if (! n[:activity].nil?) && (n[:activity].to_f < 0.0) - take_logs = false - end - end - acts = neighbors.collect do |n| - act = n[:activity] - take_logs ? Math.log10(act.to_f) : act.to_f - end # activities of neighbors for supervised learning + # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @return [Numeric] A prediction value. + def self.local_svm_regression(params) - sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors - begin - prediction = (props.nil? ? local_svm(neighbors, acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr", params)) - prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f) + confidence = 0.0 + prediction = nil + if params[:neighbors].size>0 + props = params[:prop_kernel] ? get_props(params) : nil + acts = params[:neighbors].collect{ |n| n[:activity].to_f } + sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } + prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr") + transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})") + prediction = transformer.values[0] LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + params[:conf_stdev] = false if params[:conf_stdev].nil? + confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) end - - conf = sims.inject{|sum,x| sum + x } - confidence = conf/neighbors.size if neighbors.size > 0 {:prediction => prediction, :confidence => confidence} end # Local support vector classification from neighbors - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` - # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @return [Hash] Hash with keys `:prediction, :confidence` - def self.local_svm_classification(neighbors, params, props=nil) - acts = neighbors.collect do |n| - act = n[:activity] - end # activities of neighbors for supervised learning -# acts_f = acts.collect {|v| v == true ? 1.0 : 0.0} - acts_f = acts - sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors - begin - prediction = (props.nil? ? local_svm(neighbors, acts_f, sims, "C-bsvc", params) : local_svm_prop(props, acts_f, "C-bsvc", params)) + # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required + # @return [Numeric] A prediction value. + def self.local_svm_classification(params) + + confidence = 0.0 + prediction = nil + if params[:neighbors].size>0 + props = params[:prop_kernel] ? get_props(params) : nil + acts = params[:neighbors].collect { |n| act = n[:activity] } + sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors + prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc") LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + params[:conf_stdev] = false if params[:conf_stdev].nil? + confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]}) end - - conf = sims.inject{|sum,x| sum + x } - confidence = conf/neighbors.size if neighbors.size > 0 {:prediction => prediction, :confidence => confidence} end @@ -379,26 +473,34 @@ module OpenTox # Local support vector prediction from neighbors. # Uses pre-defined Kernel Matrix. # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` # @param [Array] acts, activities for neighbors. # @param [Array] sims, similarities for neighbors. # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification). - # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required # @return [Numeric] A prediction value. - def self.local_svm(neighbors, acts, sims, type, params) + def self.local_svm(acts, sims, type, params) LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)." - neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches + neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel - if neighbor_matches.size == 0 - raise "No neighbors found." + + prediction = nil + if Algorithm::zero_variance? acts + prediction = acts[0] else # gram matrix (0..(neighbor_matches.length-1)).each do |i| + neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]] gram_matrix[i] = [] unless gram_matrix[i] # upper triangle ((i+1)..(neighbor_matches.length-1)).each do |j| - sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])") + neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]] + sim_params = {} + if params[:nr_hits] + sim_params[:nr_hits] = true + sim_params[:compound_features_hits] = neighbor_i_hits + sim_params[:training_compound_features_hits] = neighbor_j_hits + end + sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)") gram_matrix[i][j] = Algorithm.gauss(sim) gram_matrix[j] = [] unless gram_matrix[j] gram_matrix[j][i] = gram_matrix[i][j] # lower triangle @@ -406,6 +508,7 @@ module OpenTox gram_matrix[i][i] = 1.0 end + #LOGGER.debug gram_matrix.to_yaml @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed @@ -443,7 +546,8 @@ module OpenTox end @r.quit # free R rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end end @@ -453,22 +557,19 @@ module OpenTox # Local support vector prediction from neighbors. # Uses propositionalized setting. # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` - # @param [Array] acts, activities for neighbors. # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Array] acts, activities for neighbors. # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification). - # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required # @return [Numeric] A prediction value. - def self.local_svm_prop(props, acts, type, params) + def self.local_svm_prop(props, acts, type) LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)." n_prop = props[0] # is a matrix, i.e. two nested Arrays. q_prop = props[1] # is an Array. - #neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches - #gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel - if n_prop.size == 0 - raise "No neighbors found." + prediction = nil + if Algorithm::zero_variance? acts + prediction = acts[0] else #LOGGER.debug gram_matrix.to_yaml @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests @@ -505,12 +606,85 @@ module OpenTox end @r.quit # free R rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end end prediction end + # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set. + # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev + # @return[Float] Confidence + def self.get_confidence(params) + if params[:conf_stdev] + sim_median = params[:sims].to_scale.median + if sim_median.nil? + confidence = nil + else + standard_deviation = params[:acts].to_scale.standard_deviation_sample + confidence = (sim_median*Math.exp(-1*standard_deviation)).abs + if confidence.nan? + confidence = nil + end + end + else + conf = params[:sims].inject{|sum,x| sum + x } + confidence = conf/params[:neighbors].size + end + LOGGER.debug "Confidence is: '" + confidence.to_s + "'." + return confidence + end + + # Get X and Y size of a nested Array (Matrix) + def self.get_sizes(matrix) + begin + nr_cases = matrix.size + nr_features = matrix[0].size + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + #puts "NRC: #{nr_cases}, NRF: #{nr_features}" + [ nr_cases, nr_features ] + end + + # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features) + # Same for the vector describing the query compound + # @param[Array] neighbors. + # @param[OpenTox::Compound] query compound. + # @param[Array] Dataset Features. + # @param[Array] Fingerprints of neighbors. + # @param[Float] p-values of Features. + def self.get_props (params) + matrix = Array.new + begin + params[:neighbors].each do |n| + n = n[:compound] + row = [] + params[:features].each do |f| + if ! params[:fingerprints][n].nil? + row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0) + else + row << 0.0 + end + end + matrix << row + end + row = [] + params[:features].each do |f| + if params[:nr_hits] + compound_feature_hits = params[:compound].match_hits([f]) + row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f])) + else + row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f]) + end + end + rescue Exception => e + LOGGER.debug "get_props failed with '" + $! + "'" + end + [ matrix, row ] + end end @@ -531,6 +705,195 @@ module OpenTox def features(dataset_uri,compound_uri) end end + + module Transform + include Algorithm + + # The transformer that inverts values. + # 1/x is used, after values have been moved >= 1. + class Inverter + attr_accessor :offset, :values + + # @params[Array] Values to transform. + # @params[Float] Offset for restore. + def initialize *args + case args.size + when 1 + begin + values=args[0] + raise "Cannot transform, values empty." if @values.size==0 + @values = values.collect { |v| -1.0 * v } + @offset = 1.0 - @values.minmax[0] + @offset = -1.0 * @offset if @offset>0.0 + @values.collect! { |v| v - @offset } # slide >1 + @values.collect! { |v| 1 / v } # invert to [0,1] + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + when 2 + @offset = args[1].to_f + @values = args[0].collect { |v| 1 / v } + @values.collect! { |v| v + @offset } + @values.collect! { |v| -1.0 * v } + end + end + end + + # The transformer that takes logs. + # Log10 is used, after values have been moved > 0. + class Log10 + attr_accessor :offset, :values + + # @params[Array] Values to transform / restore. + # @params[Float] Offset for restore. + def initialize *args + @distance_to_zero = 0.000000001 # 1 / 1 billion + case args.size + when 1 + begin + values=args[0] + raise "Cannot transform, values empty." if values.size==0 + @offset = values.minmax[0] + @offset = -1.0 * @offset if @offset>0.0 + @values = values.collect { |v| v - @offset } # slide > anchor + @values.collect! { |v| v + @distance_to_zero } # + @values.collect! { |v| Math::log10 v } # log10 (can fail) + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + when 2 + @offset = args[1].to_f + @values = args[0].collect { |v| 10**v } + @values.collect! { |v| v - @distance_to_zero } + @values.collect! { |v| v + @offset } + end + end + end + + # The transformer that does nothing (No OPeration). + class NOP + attr_accessor :offset, :values + + # @params[Array] Values to transform / restore. + # @params[Float] Offset for restore. + def initialize *args + @offset = 0.0 + @distance_to_zero = 0.0 + case args.size + when 1 + @values = args[0] + when 2 + @values = args[0] + end + end + end + + + # Auto-Scaler for Arrays + # Center on mean and divide by standard deviation + class AutoScale + attr_accessor :scaled_values, :mean, :stdev + + # @params[Array] Values to transform. + def initialize values + @scaled_values = values + @mean = @scaled_values.to_scale.mean + @stdev = @scaled_values.to_scale.standard_deviation_sample + @scaled_values = @scaled_values.collect {|vi| vi - @mean } + @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0 + end + end + + # Principal Components Analysis + # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos + class PCA + attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler + + # Creates a transformed dataset as GSL::Matrix. + # @param [GSL::Matrix] Data matrix. + # @param [Float] Compression ratio from [0,1]. + # @return [GSL::Matrix] Data transformed matrix. + def initialize data_matrix, compression=0.05 + begin + @data_matrix = data_matrix + @compression = compression.to_f + @stdev = Array.new + @mean = Array.new + + # Objective Feature Selection + raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2 + @data_matrix_selected = nil + (0..@data_matrix.size2-1).each { |i| + if !Algorithm::zero_variance?(@data_matrix.col(i).to_a) + if @data_matrix_selected.nil? + @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1) + @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i) + else + @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1)) + end + end + } + raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2) + + # Scaling of Axes + @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2) + (0..@data_matrix_selected.size2-1).each { |i| + @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i)) + @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values + @stdev << @autoscaler.stdev + @mean << @autoscaler.mean + } + + data_matrix_hash = Hash.new + (0..@data_matrix_scaled.size2-1).each { |i| + column_view = @data_matrix_scaled.col(i) + data_matrix_hash[i] = column_view.to_scale + } + dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9 + cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash) + pca=Statsample::Factor::PCA.new(cor_matrix) + pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? } + @eigenvalue_sums = Array.new + (0..dataset_hash.fields.size-1).each { |i| + @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev } + } + eigenvectors_selected = Array.new + pca.eigenvectors.each_with_index { |ev, i| + if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0) + eigenvectors_selected << ev.to_a + end + } + @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose + dataset_matrix = dataset_hash.to_gsl.transpose + @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + # Restores data in the original feature space (possibly with compression loss). + # @return [GSL::Matrix] Data matrix. + def restore + begin + data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca + # reverse scaling + (0..data_matrix_restored.size2-1).each { |i| + data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0 + data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i] + } + data_matrix_restored + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + end + end + + end + + end # Gauss kernel # @return [Float] @@ -538,24 +901,31 @@ module OpenTox d = 1.0 - x.to_f Math.exp(-(d*d)/(2*sigma*sigma)) end - - # Median of an array - # @param [Array] Array with values - # @return [Float] Median - def self.median(array) - return nil if array.empty? - array.sort! - m_pos = array.size / 2 - return array.size % 2 == 1 ? array[m_pos] : (array[m_pos-1] + array[m_pos])/2 + + # For symbolic features + # @param [Array] Array to test, must indicate non-occurrence with 0. + # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere. + def self.isnull_or_singular?(array) + nr_zeroes = array.count(0) + return (nr_zeroes == array.size) || # remove non-occurring feature + (nr_zeroes == array.size-1) || # remove singular feature + (nr_zeroes == 0) # also remove feature present everywhere end - # Sum of an array for Numeric values - # @param [Array] Array with values - # @return [Integer] Sum of values - def self.sum(array) - array.inject{|s,x| s + x } + # Numeric value test + # @param[Object] value + # @return [Boolean] Whether value is a number + def self.numeric?(value) + true if Float(value) rescue false end + # For symbolic features + # @param [Array] Array to test, must indicate non-occurrence with 0. + # @return [Boolean] Whether the feature has variance zero. + def self.zero_variance?(array) + return (array.to_scale.variance_sample == 0.0) + end + # Sum of an array for Arrays. # @param [Array] Array with values # @return [Integer] Sum of size of values @@ -565,14 +935,13 @@ module OpenTox return sum end - # Minimum Frequency # @param [Integer] per-mil value # return [Integer] min-frequency def self.min_frequency(training_dataset,per_mil) - minfreq = per_mil*training_dataset.compounds.size/1000 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST minfreq = 2 unless minfreq > 2 - minfreq + Integer (minfreq) end # Effect calculation for classification @@ -582,7 +951,7 @@ module OpenTox max=0 max_value=0 nr_o = self.sum_size(occurrences) - nr_db = self.sum(db_instances) + nr_db = db_instances.to_scale.sum occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity. actual = o.size.to_f/nr_o @@ -596,8 +965,20 @@ module OpenTox } max end - - + + # Returns Support value of an fingerprint + # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required + # return [Numeric] Support value + def self.p_sum_support(params) + p_sum = 0.0 + params[:features].each{|f| + compound_hits = params[:compound_features_hits][f] + neighbor_hits = params[:training_compound_features_hits][f] + p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))") + } + p_sum + end + end end diff --git a/lib/compound.rb b/lib/compound.rb index d374b02..e7b4da0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -164,6 +164,35 @@ module OpenTox #smarts_array.collect { |s| s if match?(s)}.compact end + # Match_hits an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value + # @example + # compound = OpenTox::Compound.from_name("Benzene") + # compound.match(['cc','cN']) # returns ['cc'] + # @param [Array] smarts_array Array with Smarts strings + # @return [Hash] Hash with matching smarts as key and number of non-unique hits as value + def match_hits(smarts_array) + # avoid recreation of OpenBabel objects + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('inchi') + obconversion.read_string(obmol,@inchi) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts_hits = {} + #LOGGER.debug "dv ----------- obmol #{Compound.new(@inchi).to_smiles}" + smarts_array.collect do |smarts| + #LOGGER.debug "dv ----------- all smarts #{smarts}" + smarts_pattern.init(smarts) + if smarts_pattern.match(obmol) + hits = smarts_pattern.get_map_list + smarts_hits[smarts] = hits.size + end + end + #LOGGER.debug "dv ----------- smarts => hits #{smarts_hits}" + return smarts_hits + #smarts_array.collect { |s| s if match?(s)}.compact + end + + # Get URI of compound image with highlighted fragments # # @param [Array] activating Array with activating Smarts strings diff --git a/lib/dataset.rb b/lib/dataset.rb index f13c0d3..5ebad0f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -102,6 +102,13 @@ module OpenTox copy parser.load_uri(subjectid) end + def load_sdf(sdf,subjectid=nil) + save(subjectid) unless @uri # get a uri for creating features + parser = Parser::Sdf.new + parser.dataset = self + parser.load_sdf(sdf) + end + # Load CSV string (format specification: http://toxcreate.org/help) # - loads data_entries, compounds, features # - sets metadata (warnings) for parser errors @@ -230,6 +237,30 @@ module OpenTox s.to_rdfxml end + # Get SDF representation of compounds + # @return [String] SDF representation + def to_sdf + sum="" + @compounds.each{ |c| + sum << OpenTox::Compound.new(c).to_inchi + sum << OpenTox::Compound.new(c).to_sdf.sub(/\n\$\$\$\$/,'') + @data_entries[c].each{ |f,v| + sum << "> <\"#{f}\">\n" + sum << v.join(", ") + sum << "\n\n" + } + sum << "$$$$\n" + } + sum + end + + def to_urilist + @compounds.inject { |sum, c| + sum << OpenTox::Compound.new(c).uri + sum + "\n" + } + end + # Get name (DC.title) of a feature # @param [String] feature Feature URI # @return [String] Feture title diff --git a/lib/model.rb b/lib/model.rb index 825f697..26c42a5 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,38 +50,49 @@ module OpenTox @predicted_variable end + def predicted_variables( subjectid ) + load_predicted_variables( subjectid, false ) unless @predicted_variables + @predicted_variables + end + def predicted_confidence( subjectid ) load_predicted_variables( subjectid ) unless @predicted_confidence @predicted_confidence end private - def load_predicted_variables( subjectid=nil ) + def load_predicted_variables( subjectid=nil, use_confidence=true ) load_metadata(subjectid) if @metadata==nil or @metadata.size==0 or (@metadata.size==1 && @metadata.values[0]==@uri) if @metadata[OT.predictedVariables] predictedVariables = @metadata[OT.predictedVariables] if predictedVariables.is_a?(Array) if (predictedVariables.size==1) @predicted_variable = predictedVariables[0] - elsif (predictedVariables.size==2) + elsif (predictedVariables.size>=2) # PENDING identify confidence - conf_index = -1 - predictedVariables.size.times do |i| - f = OpenTox::Feature.find(predictedVariables[i]) - conf_index = i if f.metadata[DC.title]=~/(?i)confidence/ + if use_confidence + conf_index = -1 + predictedVariables.size.times do |i| + f = OpenTox::Feature.find(predictedVariables[i], subjectid) + conf_index = i if f.metadata[DC.title]=~/(?i)confidence/ + end + raise "could not estimate predicted variable from model: '"+uri.to_s+ + "', number of predicted-variables==2, but no confidence found" if conf_index==-1 + end + if (predictedVariables.size==2) && use_confidence + @predicted_variable = predictedVariables[1-conf_index] + @predicted_confidence = predictedVariables[conf_index] + else + @predicted_variables = predictedVariables end - raise "could not estimate predicted variable from model: '"+uri.to_s+ - "', number of predicted-variables==2, but no confidence found" if conf_index==-1 - @predicted_variable = predictedVariables[1-conf_index] - @predicted_confidence = predictedVariables[conf_index] else - raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables > 2" + raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables == 0" end else raise "could not estimate predicted variable from model: '"+uri.to_s+"', predicted-variables is no array" end end - raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless @predicted_variable + raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless (@predicted_variable || @predicted_variables) end end @@ -91,7 +102,7 @@ module OpenTox include Algorithm include Model - attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :balanced + attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev def initialize(uri=nil) @@ -113,10 +124,12 @@ module OpenTox @feature_calculation_algorithm = "Substructure.match" @similarity_algorithm = "Similarity.tanimoto" @prediction_algorithm = "Neighbors.weighted_majority_vote" - + + @nr_hits = false @min_sim = 0.3 @prop_kernel = false - @balanced = false + @transform = { "class" => "NOP" } + @conf_stdev = false end @@ -168,6 +181,7 @@ module OpenTox # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly # @return [OpenTox::Dataset] Dataset with predictions def predict_dataset(dataset_uri, subjectid=nil, waiting_task=nil) + @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid) @prediction_dataset.add_metadata({ OT.hasSource => @uri, @@ -212,90 +226,33 @@ module OpenTox unless database_activity(subjectid) # adds database activity to @prediction_dataset - if @balanced && OpenTox::Feature.find(metadata[OT.dependentVariables]).feature_type == "classification" - # AM: Balancing, see http://www.maunz.de/wordpress/opentox/2011/balanced-lazar - l = Array.new # larger - s = Array.new # smaller fraction - - raise "no fingerprints in model" if @fingerprints.size==0 - - @fingerprints.each do |training_compound,training_features| - @activities[training_compound].each do |act| - case act.to_s - when "0" - l << training_compound - when "1" - s << training_compound - else - LOGGER.warn "BLAZAR: Activity #{act.to_s} should not be reached (supports only two classes)." - end - end - end - if s.size > l.size then - l,s = s,l # happy swapping - LOGGER.info "BLAZAR: |s|=#{s.size}, |l|=#{l.size}." - end - # determine ratio - modulo = l.size.divmod(s.size)# modulo[0]=ratio, modulo[1]=rest - LOGGER.info "BLAZAR: Balance: #{modulo[0]}, rest #{modulo[1]}." - - # AM: Balanced predictions - addon = (modulo[1].to_f/modulo[0]).ceil # what will be added in each round - slack = (addon!=0 ? modulo[1].divmod(addon)[1] : 0) # what remains for the last round - position = 0 - predictions = Array.new - - prediction_best=nil - neighbors_best=nil - - begin - for i in 1..modulo[0] do - (i == modulo[0]) && (slack>0) ? lr_size = s.size + slack : lr_size = s.size + addon # determine fraction - LOGGER.info "BLAZAR: Neighbors round #{i}: #{position} + #{lr_size}." - neighbors_balanced(s, l, position, lr_size) # get ratio fraction of larger part - if @prop_kernel && ( @prediction_algorithm.include?("svm") || @prediction_algorithm.include?("local_mlr_prop") ) - props = get_props - else - props = nil - end - prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values, :value_map => @value_map}, props)") - if prediction_best.nil? || prediction[:confidence].abs > prediction_best[:confidence].abs - prediction_best=prediction - neighbors_best=@neighbors - end - position = position + lr_size - end - rescue Exception => e - LOGGER.error "BLAZAR failed in prediction: "+e.class.to_s+": "+e.message - end - - prediction=prediction_best - @neighbors=neighbors_best - ### END AM balanced predictions + neighbors + prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors, + :compound => @compound, + :features => @features, + :p_values => @p_values, + :fingerprints => @fingerprints, + :similarity_algorithm => @similarity_algorithm, + :prop_kernel => @prop_kernel, + :value_map => @value_map, + :nr_hits => @nr_hits, + :conf_stdev => @conf_stdev, + :transform => @transform } ) ") - else # AM: no balancing or regression - LOGGER.info "LAZAR: Unbalanced." - neighbors - if @prop_kernel && ( @prediction_algorithm.include?("svm") || @prediction_algorithm.include?("local_mlr_prop") ) - props = get_props - else - props = nil - end - prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values, :value_map => @value_map}, props)") - end - value_feature_uri = File.join( @uri, "predicted", "value") confidence_feature_uri = File.join( @uri, "predicted", "confidence") @prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables] @prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables] - if OpenTox::Feature.find(metadata[OT.dependentVariables]).feature_type == "classification" + if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification" @prediction_dataset.add @compound.uri, value_feature_uri, @value_map[prediction[:prediction]] else @prediction_dataset.add @compound.uri, value_feature_uri, prediction[:prediction] end @prediction_dataset.add @compound.uri, confidence_feature_uri, prediction[:confidence] + @prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title] + @prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence" if verbose if @feature_calculation_algorithm == "Substructure.match" @@ -356,56 +313,32 @@ module OpenTox @prediction_dataset end - # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features) - # Same for the vector describing the query compound - def get_props - matrix = Array.new - begin - @neighbors.each do |n| - n = n[:compound] - row = [] - @features.each do |f| - if ! @fingerprints[n].nil? - row << (@fingerprints[n].include?(f) ? 0.0 : @p_values[f]) - else - row << 0.0 - end - end - matrix << row - end - row = [] - @features.each do |f| - row << (@compound.match([f]).size == 0 ? 0.0 : @p_values[f]) - end - rescue Exception => e - LOGGER.debug "get_props failed with '" + $! + "'" - end - [ matrix, row ] - end - - # Find neighbors and store them as object variable, access only a subset of compounds for that. - def neighbors_balanced(s, l, start, offset) - @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm - @neighbors = [] - [ l[start, offset ] , s ].flatten.each do |training_compound| # AM: access only a balanced subset - training_features = @fingerprints[training_compound] - add_neighbor training_features, training_compound - end - - end + # Find neighbors and store them as object variable, access all compounds for that. def neighbors @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm @neighbors = [] - @fingerprints.each do |training_compound,training_features| # AM: access all compounds - add_neighbor training_features, training_compound + @fingerprints.keys.each do |training_compound| # AM: access all compounds + add_neighbor @fingerprints[training_compound].keys, training_compound end end # Adds a neighbor to @neighbors if it passes the similarity threshold. def add_neighbor(training_features, training_compound) - sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)") + compound_features_hits = {} + training_compound_features_hits = {} + if @nr_hits + compound_features_hits = @compound.match_hits(@compound_features) + training_compound_features_hits = @fingerprints[training_compound] + #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class} #{training_compound_features_hits}" + end + params = {} + params[:nr_hits] = @nr_hits + params[:compound_features_hits] = compound_features_hits + params[:training_compound_features_hits] = training_compound_features_hits + + sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)") if sim > @min_sim @activities[training_compound].each do |act| @neighbors << { diff --git a/lib/parser.rb b/lib/parser.rb index 07bee67..d0975af 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -86,7 +86,11 @@ module OpenTox # @param [String] rdf # @param [String] type of the info (e.g. OT.Task, OT.ErrorReport) needed to get the subject-uri # @return [Owl] with uri and metadata set - def self.from_rdf( rdf, type ) + def self.from_rdf( rdf, type, allow_multiple = false ) + + uris = Array.new + owls = Array.new + # write to file and read convert with rapper into tripples file = Tempfile.new("ot-rdfxml") file.puts rdf @@ -99,20 +103,27 @@ module OpenTox triples.each_line do |line| triple = line.to_triple if triple[1] == RDF['type'] and triple[2]==type - raise "uri already set, two uris found with type: "+type.to_s if uri + if !allow_multiple + raise "uri already set, two uris found with type: "+type.to_s if uri + end uri = triple[0] + uris << uri end end File.delete(file.path) + # load metadata - metadata = {} - triples.each_line do |line| - triple = line.to_triple - metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] - end - owl = Owl::Generic.new(uri) - owl.metadata = metadata - owl + uris.each { |uri| + metadata = {} + triples.each_line do |line| + triple = line.to_triple + metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] + end + owl = Owl::Generic.new(uri) + owl.metadata = metadata + owls << owl + } + allow_multiple ? owls : owls[0] end # Generic parser for all OpenTox classes @@ -350,7 +361,6 @@ module OpenTox @dataset end - private def warnings @@ -437,12 +447,8 @@ module OpenTox end end - def numeric?(value) - true if Float(value) rescue false - end - def feature_type(value) - if numeric? value + if OpenTox::Algorithm::numeric? value return OT.NumericFeature else return OT.NominalFeature @@ -454,5 +460,159 @@ module OpenTox end end + + class Table + + attr_accessor :data, :features, :compounds + + def initialize + @data = {} + @activity_errors = [] + end + + def feature_values(feature) + @data.collect{|c, row| row[feature]}.uniq.compact + end + + def feature_types(feature) + @data.collect{|c, row| feature_type(row[feature])}.uniq.compact + end + + def features + @data.collect{|c,row| row.keys}.flatten.uniq + end + + def clean_features + ignored_features = [] + features.each do |feature| + if feature_values(feature).size > 5 + if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature + # REGRESSION + elsif feature_types(feature).include? OT.NumericFeature + @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features + @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)." + else + @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)." + ignored_features << feature + next + end + elsif feature_values(feature).size <= 1 + @activity_errors << "Feature #{feature} ignored (less than 2 feature values)." + ignored_features << feature + else + # CLASSIFICATION + end + end + ignored_features.each do |feature| + @data.each{ |c,row| row.delete feature } + end + @activity_errors + end + + def add_to_dataset(dataset) + features.each do |feature_name| + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name)) + dataset.add_feature(feature_uri,{DC.title => feature_name}) + end + + @data.each do |compound,row| + unless row.empty? + row.each do |feature,value| + if OpenTox::Algorithm::numeric?(value) + value = value.to_f + elsif value.nil? or value.empty? + value = nil + else + value = value.to_s + end + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature)) + dataset.add(compound, feature_uri, value) + #dataset.features[feature_uri][RDF.type] = feature_types(feature) + #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + if feature_types(feature).include? OT.NumericFeature + dataset.features[feature_uri][RDF.type] = [OT.NumericFeature] + else + dataset.features[feature_uri][RDF.type] = [OT.NominalFeature] + dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + end + end + end + end + end + + private + + def feature_type(value) + if OpenTox::Algorithm::numeric? value + return OT.NumericFeature + else + return OT.NominalFeature + end + end + end + + # quick hack to enable sdf import via csv + # should be refactored + class Sdf + + attr_accessor :dataset + + def initialize + @data = {} + + @compound_errors = [] + @activity_errors = [] + @duplicates = {} + end + + def load_sdf(sdf) + + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats "sdf", "inchi" + + table = Table.new + + properties = [] + sdf.each_line { |l| properties << l.to_s if l.match(/</) } + properties.uniq! + properties.sort! + properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp } + + rec = 0 + sdf.split(/\$\$\$\$\r*\n/).each do |s| + rec += 1 + obconversion.read_string obmol, s + begin + inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp + @duplicates[inchi] = [] unless @duplicates[inchi] + @duplicates[inchi] << rec #inchi#+", "+row.join(", ") + compound = Compound.from_inchi inchi + rescue + @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}" + next + end + row = {} + obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) } + table.data[compound.uri] = row + end + + # finda and remove ignored_features + @activity_errors = table.clean_features + table.add_to_dataset @dataset + + warnings = '' + warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty? + warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty? + duplicate_warnings = '' + @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 } + warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty? + + @dataset.metadata[OT.Warnings] = warnings + @dataset + + end + + end end end |