author    mguetlein <martin.guetlein@gmail.com>  2011-08-03 15:26:26 +0200
committer mguetlein <martin.guetlein@gmail.com>  2011-08-03 15:26:26 +0200
commit    3fddd473ee16757ac5ae98b5ebcdca2834439ded (patch)
tree      64ecfee255ddde801d1db2610a0f177a8609d23b
parent    0c21b5c58977d16c74d7e976d37d5361ffcb63d1 (diff)
parent    acfe33c4fd91efe5d5455892f20a3ffe20c3954c (diff)

Merge branch 'development' of github.com:opentox/opentox-ruby into development
-rw-r--r--  Rakefile          |   4
-rw-r--r--  lib/algorithm.rb  | 713
-rw-r--r--  lib/compound.rb   |  29
-rw-r--r--  lib/dataset.rb    |  31
-rw-r--r--  lib/model.rb      | 187
-rw-r--r--  lib/parser.rb     | 192

6 files changed, 846 insertions(+), 310 deletions(-)
diff --git a/Rakefile b/Rakefile
index f54e23e..952affe 100644
--- a/Rakefile
+++ b/Rakefile
@@ -16,7 +16,7 @@ begin
gem.add_dependency "sinatra-respond_to", "=0.7.0"
gem.add_dependency "sinatra-static-assets", "=0.5.0"
gem.add_dependency "rest-client", "=1.6.1"
- gem.add_dependency "rack", "=1.3.0"
+ gem.add_dependency "rack", "=1.3.1"
gem.add_dependency "rack-contrib", "=1.1.0"
gem.add_dependency "rack-flash", "=0.1.1"
gem.add_dependency "nokogiri", "=1.4.4"
@@ -44,6 +44,8 @@ begin
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
gem.add_dependency "ruby-plot", "=0.5.0"
gem.add_dependency "gsl", "=1.14.7"
+ gem.add_dependency "statsample", "=1.1.0"
+ #gem.add_dependency "statsample-optimization", "=2.1.0"
gem.add_development_dependency 'jeweler'
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index bfa79d3..9a5ff01 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -3,6 +3,8 @@
# avoids compiling R with X
R = nil
require "rinruby"
+require "statsample"
+require 'uri'
module OpenTox
@@ -50,11 +52,11 @@ module OpenTox
include Algorithm
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
- def check_params(params,per_mil)
+ def check_params(params,per_mil,subjectid=nil)
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- @prediction_feature = OpenTox::Feature.find params[:prediction_feature], @subjectid
- @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", @subjectid
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+ @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
unless params[:min_frequency].nil?
@@ -80,18 +82,6 @@ module OpenTox
next
end
- # AM: take log if appropriate
- take_logs=true
- entry.each do |feature,values|
- values.each do |value|
- if @prediction_feature.feature_type == "regression"
- if (! value.nil?) && (value.to_f <= 0)
- take_logs=false
- end
- end
- end
- end
-
value_map=params[:value_map] unless params[:value_map].nil?
entry.each do |feature,values|
if feature == @prediction_feature.uri
@@ -103,7 +93,7 @@ module OpenTox
activity= value_map.invert[value].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
- activity= take_logs ? Math.log10(value.to_f) : value.to_f
+ activity= value.to_f
end
begin
fminer_instance.AddCompound(smiles,id)
@@ -164,19 +154,34 @@ module OpenTox
# @param [Array] features_a Features of first compound
# @param [Array] features_b Features of second compound
# @param [optional, Hash] weights Weights for all features
+ # @param [optional, Hash] params Keys `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required when hit counts are used (`:nr_hits` set)
# @return [Float] (Weighted) tanimoto similarity
- def self.tanimoto(features_a,features_b,weights=nil)
+ def self.tanimoto(features_a,features_b,weights=nil,params=nil)
common_features = features_a & features_b
all_features = (features_a + features_b).uniq
- common_p_sum = 0.0
+ #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
if common_features.size > 0
if weights
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
- all_p_sum = 0.0
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+ #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
+ if !params.nil? && params[:nr_hits]
+ params[:weights] = weights
+ params[:mode] = "min"
+ params[:features] = common_features
+ common_p_sum = Algorithm.p_sum_support(params)
+ params[:mode] = "max"
+ params[:features] = all_features
+ all_p_sum = Algorithm.p_sum_support(params)
+ else
+ common_p_sum = 0.0
+ common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
+ all_p_sum = 0.0
+ all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+ end
+ #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
common_p_sum/all_p_sum
else
- common_features.to_f/all_features
+ #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
+ common_features.size.to_f/all_features.size.to_f
end
else
0.0
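With no weights the method reduces to plain set arithmetic; a minimal standalone sketch with hypothetical fragment sets:

  features_a = ["cc", "cN", "C=O"]
  features_b = ["cc", "C=O", "O"]
  common = features_a & features_b            # => ["cc", "C=O"]
  all    = (features_a + features_b).uniq     # => ["cc", "cN", "C=O", "O"]
  sim    = common.size.to_f / all.size.to_f   # => 0.5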
@@ -206,90 +211,192 @@ module OpenTox
end
end
- module Neighbors
+ # Structural Graph Clustering by TU Munich
+ # Finds clusters similar to a query structure in a given training dataset
+ # May be queried for cluster membership of an unknown compound
+ class StructuralClustering
+ attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array
- # Local multi-linear regression (MLR) prediction from neighbors.
- # Uses propositionalized setting.
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @return [Numeric] A prediction value.
- def self.local_mlr_prop(neighbors, params, props)
+ # @params[String] Training dataset_uri
+ # @params[Float] Similarity threshold for training (optional)
+ # @params[String] Cluster service uri (no AA)
+ def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"
- take_logs=true
+ if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
+ raise "Invalid URI."
+ end
+ @training_dataset_uri = training_dataset_uri
+ if !OpenTox::Algorithm.numeric?(training_threshold) || training_threshold < 0 || training_threshold > 1
+ raise "Training threshold out of bounds."
+ end
+ @training_threshold = training_threshold.to_f
- neighbors.each do |n|
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
- take_logs = false
+ # Train a cluster model
+ params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
+ @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
+ cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
+ @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model
+
+ # Process parsed OWL objects
+ @clusterid_dataset_map = Hash.new
+ @datasets.each { |d|
+ begin
+ d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # strip the prefix from the metadata string (not elegant)
+ @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
+ rescue Exception => e
+ # ignore other entries!
end
+ }
+ end
+
+ # Whether a model has been trained
+ def trained?
+ !@cluster_model_uri.nil?
+ end
+
+ # Instance query: clusters for a compound
+ # @params[String] Query compound
+ # @params[Float] Similarity threshold for query to clusters (optional)
+ def get_clusters query_compound_uri, query_threshold = 0.5
+
+ if !OpenTox::Algorithm.numeric?(query_threshold) || query_threshold < 0 || query_threshold > 1
+ raise "Query threshold out of bounds."
+ end
+ @query_threshold = query_threshold.to_f
+
+
+ # Preparing a query dataset
+ query_dataset = OpenTox::Dataset.new
+ @query_dataset_uri = query_dataset.save
+ query_dataset = OpenTox::Dataset.find @query_dataset_uri
+ query_dataset.add_compound query_compound_uri
+ @query_dataset_uri = query_dataset.save
+
+ # Obtaining a clustering for query compound
+ params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
+ cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
+ cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
+ cluster_query_dataset.load_all
+
+ # Reading cluster ids for features from metadata
+ feature_clusterid_map = Hash.new
+ pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
+ cluster_query_dataset.features.each { |feature_uri,metadata|
+ metadata[DC.title][pattern]=""
+ feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
+ }
+
+ # Integrity check
+ unless cluster_query_dataset.compounds.size == 1
+ raise "Number of predicted compounds is != 1."
end
- acts = neighbors.collect do |n|
- act = n[:activity]
- take_logs ? Math.log10(act.to_f) : act.to_f
- end # activities of neighbors for supervised learning
+ # Process data entry
+ query_compound_uri = cluster_query_dataset.compounds[0]
+ @target_clusters_array = Array.new
+ cluster_query_dataset.features.keys.each { |cluster_membership_feature|
+
+ # Getting dataset URI for cluster
+ target_cluster = feature_clusterid_map[cluster_membership_feature]
+ dataset = @clusterid_dataset_map[target_cluster]
+
+ # Finally look up presence
+ data_entry = cluster_query_dataset.data_entries[query_compound_uri]
+ present = data_entry[cluster_membership_feature][0]
+
+ # Store result
+ @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
+ }
+ end
+ end
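A minimal usage sketch, assuming a running opentox-ruby environment; the dataset and compound URIs are hypothetical placeholders:

  sc = OpenTox::Algorithm::StructuralClustering.new("http://example.org/dataset/1", 0.8)
  if sc.trained?
    sc.get_clusters("http://example.org/compound/1", 0.5)
    sc.target_clusters_array   # dataset URIs of the clusters the query compound falls into
  end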
- begin
+ module Neighbors
+ # Local multi-linear regression (MLR) prediction from neighbors.
+ # Uses propositionalized setting.
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_mlr_prop(params)
+
+ confidence=0.0
+ prediction=nil
+
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
+ sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
LOGGER.debug "Local MLR (Propositionalization / GSL)."
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
- q_prop = props[1] # is an Array.
- n_prop_x_size = n_prop[0].size
- n_prop_y_size = n_prop.size
+ prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
+ transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+ prediction = transformer.values[0]
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ end
+ {:prediction => prediction, :confidence => confidence}
- n_prop.flatten!
- y_x_rel = n_prop_y_size.to_f / n_prop_x_size
- repeat_factor = (1/y_x_rel).ceil
- n_prop_tmp = Array.new ; repeat_factor.times { n_prop_tmp.concat n_prop } ; n_prop = n_prop_tmp
- acts_tmp = Array.new ; repeat_factor.times { acts_tmp.concat acts } ; acts = acts_tmp
+ end
- if n_prop.size == 0
- raise "No neighbors found."
- else
- begin
- LOGGER.debug "Setting GSL data ..."
- # set data
- prop_matrix = GSL::Matrix[n_prop, n_prop_y_size * repeat_factor, n_prop_x_size]
- y = GSL::Vector[acts]
- q_prop = GSL::Vector[q_prop]
+ # Multi-linear regression weighted by similarity.
+ # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
+ # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
+ # @return [Numeric] A prediction value.
+ def self.mlr(params)
- # model + support vectors
- LOGGER.debug "Creating MLR model ..."
- work = GSL::MultiFit::Workspace.alloc(n_prop_y_size * repeat_factor, n_prop_x_size)
- c, cov, chisq, status = GSL::MultiFit::linear(prop_matrix, y, work)
- LOGGER.debug "Predicting ..."
- prediction = GSL::MultiFit::linear_est(q_prop, c, cov)[0]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
- end
- end
+ # GSL matrix operations:
+ # to_a : row-wise conversion to nested array
+ #
+ # Statsample operations (build on GSL):
+ # to_scale: convert into Statsample format
- prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f)
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ begin
+ n_prop = params[:n_prop].collect { |v| v }
+ q_prop = params[:q_prop].collect { |v| v }
+ n_prop << q_prop # attach q_prop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # Principal Components Analysis
+ LOGGER.debug "PCA..."
+ pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
+ data_matrix = pca.data_transformed_matrix
+
+ # Attach intercept column to data
+ intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
+ data_matrix = data_matrix.horzcat(intercept)
+ (0..data_matrix.size2-2).each { |i|
+ autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
+ data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
+ }
+
+ # Detach query instance
+ n_prop = data_matrix.to_a
+ q_prop = n_prop.pop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # model + support vectors
+ LOGGER.debug "Creating MLR model ..."
+ c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
+ GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
end
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- conf = sims.inject{|sum,x| sum + x }
- confidence = conf/neighbors.size if neighbors.size > 0
- {:prediction => prediction, :confidence => confidence}
- end
+ end
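The GSL calls above can be exercised in isolation; a minimal weighted least-squares sketch with toy data (one feature plus an intercept column), mirroring the wlinear/linear_est calls of the method:

  require "gsl"
  x = GSL::Matrix.alloc([1.0, 1.0,  2.0, 1.0,  3.0, 1.0], 3, 2)   # feature + intercept
  y = GSL::Vector.alloc(1.1, 1.9, 3.2)                            # activities
  w = GSL::Vector.alloc(1.0, 0.5, 1.0)                            # similarity weights
  c, cov, chisq, status = GSL::MultiFit.wlinear(x, w, y)
  prediction = GSL::MultiFit.linear_est(GSL::Vector.alloc(4.0, 1.0), c, cov)[0]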
# Classification with majority vote from neighbors weighted by similarity
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity`
- # @param [optional] params Ignored (only for compatibility with local_svm_regression)
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.weighted_majority_vote(neighbors,params={}, props=nil)
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.weighted_majority_vote(params)
+
neighbor_contribution = 0.0
confidence_sum = 0.0
confidence = 0.0
prediction = nil
- positive_map_value= nil
- negative_map_value= nil
- neighbors.each do |neighbor|
+ params[:neighbors].each do |neighbor|
neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
@@ -307,70 +414,57 @@ module OpenTox
if params[:value_map].size == 2
if confidence_sum >= 0.0
- prediction = 2 unless neighbors.size==0
+ prediction = 2 unless params[:neighbors].size==0
elsif confidence_sum < 0.0
- prediction = 1 unless neighbors.size==0
+ prediction = 1 unless params[:neighbors].size==0
end
else
- prediction = (neighbor_contribution/confidence_sum).round unless neighbors.size==0 # AM: new multinomial prediction
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
end
-
- confidence = confidence_sum/neighbors.size if neighbors.size > 0
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
+ confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
return {:prediction => prediction, :confidence => confidence.abs}
end
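The arithmetic of the multinomial rounding branch is easy to check by hand; a standalone sketch with two hypothetical neighbors, assuming a sigma of 0.3 for Algorithm.gauss (the default value is not shown in this hunk):

  gauss = lambda { |x| d = 1.0 - x; Math.exp(-(d * d) / (2 * 0.3 * 0.3)) }
  neighbors = [ { :similarity => 0.9, :activity => 2 },
                { :similarity => 0.5, :activity => 1 } ]
  contribution = confidence_sum = 0.0
  neighbors.each do |n|
    w = gauss.call(n[:similarity])
    contribution   += n[:activity] * w
    confidence_sum += w
  end
  (contribution / confidence_sum).round   # => 2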
# Local support vector regression from neighbors
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.local_svm_regression(neighbors, params, props=nil)
- take_logs=true
- neighbors.each do |n|
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
- take_logs = false
- end
- end
- acts = neighbors.collect do |n|
- act = n[:activity]
- take_logs ? Math.log10(act.to_f) : act.to_f
- end # activities of neighbors for supervised learning
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm_regression(params)
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- begin
- prediction = (props.nil? ? local_svm(neighbors, acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr", params))
- prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f)
+ confidence = 0.0
+ prediction = nil
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect{ |n| n[:activity].to_f }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
+ prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
+ transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+ prediction = transformer.values[0]
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
end
-
- conf = sims.inject{|sum,x| sum + x }
- confidence = conf/neighbors.size if neighbors.size > 0
{:prediction => prediction, :confidence => confidence}
end
# Local support vector classification from neighbors
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.local_svm_classification(neighbors, params, props=nil)
- acts = neighbors.collect do |n|
- act = n[:activity]
- end # activities of neighbors for supervised learning
-# acts_f = acts.collect {|v| v == true ? 1.0 : 0.0}
- acts_f = acts
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- begin
- prediction = (props.nil? ? local_svm(neighbors, acts_f, sims, "C-bsvc", params) : local_svm_prop(props, acts_f, "C-bsvc", params))
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm_classification(params)
+
+ confidence = 0.0
+ prediction = nil
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity] }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
end
-
- conf = sims.inject{|sum,x| sum + x }
- confidence = conf/neighbors.size if neighbors.size > 0
{:prediction => prediction, :confidence => confidence}
end
@@ -379,26 +473,34 @@ module OpenTox
# Local support vector prediction from neighbors.
# Uses pre-defined Kernel Matrix.
# Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
# @param [Array] acts, activities for neighbors.
# @param [Array] sims, similarities for neighbors.
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
# @return [Numeric] A prediction value.
- def self.local_svm(neighbors, acts, sims, type, params)
+ def self.local_svm(acts, sims, type, params)
LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
- neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches
+ neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
- if neighbor_matches.size == 0
- raise "No neighbors found."
+
+ prediction = nil
+ if Algorithm::zero_variance? acts
+ prediction = acts[0]
else
# gram matrix
(0..(neighbor_matches.length-1)).each do |i|
+ neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
gram_matrix[i] = [] unless gram_matrix[i]
# upper triangle
((i+1)..(neighbor_matches.length-1)).each do |j|
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])")
+ neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
+ sim_params = {}
+ if params[:nr_hits]
+ sim_params[:nr_hits] = true
+ sim_params[:compound_features_hits] = neighbor_i_hits
+ sim_params[:training_compound_features_hits] = neighbor_j_hits
+ end
+ sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
gram_matrix[i][j] = Algorithm.gauss(sim)
gram_matrix[j] = [] unless gram_matrix[j]
gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
@@ -406,6 +508,7 @@ module OpenTox
gram_matrix[i][i] = 1.0
end
+
#LOGGER.debug gram_matrix.to_yaml
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
@r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
@@ -443,7 +546,8 @@ module OpenTox
end
@r.quit # free R
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
end
@@ -453,22 +557,19 @@ module OpenTox
# Local support vector prediction from neighbors.
# Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Array] acts, activities for neighbors.
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Array] acts, activities for neighbors.
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
# @return [Numeric] A prediction value.
- def self.local_svm_prop(props, acts, type, params)
+ def self.local_svm_prop(props, acts, type)
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
q_prop = props[1] # is an Array.
- #neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches
- #gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
- if n_prop.size == 0
- raise "No neighbors found."
+ prediction = nil
+ if Algorithm::zero_variance? acts
+ prediction = acts[0]
else
#LOGGER.debug gram_matrix.to_yaml
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
@@ -505,12 +606,85 @@ module OpenTox
end
@r.quit # free R
rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}"
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
end
prediction
end
+ # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
+ # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
+ # @return[Float] Confidence
+ def self.get_confidence(params)
+ if params[:conf_stdev]
+ sim_median = params[:sims].to_scale.median
+ if sim_median.nil?
+ confidence = nil
+ else
+ standard_deviation = params[:acts].to_scale.standard_deviation_sample
+ confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
+ if confidence.nan?
+ confidence = nil
+ end
+ end
+ else
+ conf = params[:sims].inject{|sum,x| sum + x }
+ confidence = conf/params[:neighbors].size
+ end
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+ return confidence
+ end
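The default branch of get_confidence is just the mean similarity; a standalone sketch:

  sims = [0.9, 0.8, 0.7]
  confidence = sims.inject { |sum, x| sum + x } / sims.size   # => 0.8
  # with :conf_stdev set, median(sims) * exp(-stdev(acts)) is used instead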
+
+ # Get X and Y size of a nested Array (Matrix)
+ def self.get_sizes(matrix)
+ begin
+ nr_cases = matrix.size
+ nr_features = matrix[0].size
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
+ [ nr_cases, nr_features ]
+ end
+
+ # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
+ # Same for the vector describing the query compound
+ # @param [Hash] params Keys `:neighbors, :compound, :features, :fingerprints, :p_values` are required; `:nr_hits` switches on hit-count weighting.
+ def self.get_props (params)
+ matrix = Array.new
+ begin
+ params[:neighbors].each do |n|
+ n = n[:compound]
+ row = []
+ params[:features].each do |f|
+ if ! params[:fingerprints][n].nil?
+ row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
+ else
+ row << 0.0
+ end
+ end
+ matrix << row
+ end
+ row = []
+ params[:features].each do |f|
+ if params[:nr_hits]
+ compound_feature_hits = params[:compound].match_hits([f])
+ row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
+ else
+ row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
+ end
+ end
+ rescue Exception => e
+ LOGGER.debug "get_props failed with '#{e.message}'"
+ end
+ [ matrix, row ]
+ end
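One neighbor's propositionalization row is the p-value-weighted hit count per feature; a standalone sketch with hypothetical values:

  features    = ["cc", "cN", "C=O"]
  p_values    = { "cc" => 0.9, "cN" => 0.5, "C=O" => 0.7 }
  fingerprint = { "cc" => 2, "C=O" => 1 }   # feature => non-unique hit count
  row = features.map { |f| fingerprint.include?(f) ? p_values[f] * fingerprint[f] : 0.0 }
  # => [1.8, 0.0, 0.7]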
end
@@ -531,6 +705,195 @@ module OpenTox
def features(dataset_uri,compound_uri)
end
end
+
+ module Transform
+ include Algorithm
+
+ # The transformer that inverts values.
+ # 1/x is used, after values have been moved >= 1.
+ class Inverter
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if values.size==0
+ @values = values.collect { |v| -1.0 * v }
+ @offset = 1.0 - @values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values.collect! { |v| v - @offset } # slide >1
+ @values.collect! { |v| 1 / v } # invert to [0,1]
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 1 / v }
+ @values.collect! { |v| v + @offset }
+ @values.collect! { |v| -1.0 * v }
+ end
+ end
+ end
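A round trip through the Inverter (one argument transforms, values plus offset restores), assuming the class above is loaded:

  inv  = OpenTox::Algorithm::Transform::Inverter.new([2.0, 4.0])
  inv.values    # => [0.333..., 1.0]  (negated, slid >= 1, inverted)
  back = OpenTox::Algorithm::Transform::Inverter.new(inv.values, inv.offset)
  back.values   # => [2.0, 4.0]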
+
+ # The transformer that takes logs.
+ # Log10 is used, after values have been moved > 0.
+ class Log10
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform / restore.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ @distance_to_zero = 0.000000001 # 1 / 1 billion
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if values.size==0
+ @offset = values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values = values.collect { |v| v - @offset } # slide > anchor
+ @values.collect! { |v| v + @distance_to_zero } #
+ @values.collect! { |v| Math::log10 v } # log10 (can fail)
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 10**v }
+ @values.collect! { |v| v - @distance_to_zero }
+ @values.collect! { |v| v + @offset }
+ end
+ end
+ end
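The same pattern holds for Log10: one argument transforms, values plus offset restores. A round trip, assuming the class above:

  log  = OpenTox::Algorithm::Transform::Log10.new([-2.0, 3.0])
  log.values    # => [-9.0, ~0.699]  (slid to [0, 5], nudged off zero, log10)
  back = OpenTox::Algorithm::Transform::Log10.new(log.values, log.offset)
  back.values   # => [~-2.0, ~3.0]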
+
+ # The transformer that does nothing (No OPeration).
+ class NOP
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform / restore.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ @offset = 0.0
+ @distance_to_zero = 0.0
+ case args.size
+ when 1
+ @values = args[0]
+ when 2
+ @values = args[0]
+ end
+ end
+ end
+
+
+ # Auto-Scaler for Arrays
+ # Center on mean and divide by standard deviation
+ class AutoScale
+ attr_accessor :scaled_values, :mean, :stdev
+
+ # @params[Array] Values to transform.
+ def initialize values
+ @scaled_values = values
+ @mean = @scaled_values.to_scale.mean
+ @stdev = @scaled_values.to_scale.standard_deviation_sample
+ @scaled_values = @scaled_values.collect {|vi| vi - @mean }
+ @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
+ end
+ end
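For [1, 2, 3] the sample standard deviation is exactly 1, which makes the scaling easy to verify (statsample must be loaded, as required above):

  scaler = OpenTox::Algorithm::Transform::AutoScale.new([1.0, 2.0, 3.0])
  scaler.mean            # => 2.0
  scaler.stdev           # => 1.0
  scaler.scaled_values   # => [-1.0, 0.0, 1.0]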
+
+ # Principal Components Analysis
+ # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1].
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix
+ @compression = compression.to_f
+ @stdev = Array.new
+ @mean = Array.new
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # Scaling of Axes
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
+ (0..@data_matrix_selected.size2-1).each { |i|
+ @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
+ @stdev << @autoscaler.stdev
+ @mean << @autoscaler.mean
+ }
+
+ data_matrix_hash = Hash.new
+ (0..@data_matrix_scaled.size2-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+ pca.eigenvalues.each { |ev| raise "PCA failed!" if ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..dataset_hash.fields.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
+ dataset_matrix = dataset_hash.to_gsl.transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..data_matrix_restored.size2-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
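A usage sketch for the PCA transform on a toy GSL matrix with two correlated columns (compression ratio left at its 0.05 default); assumes gsl and statsample are loaded:

  m = GSL::Matrix.alloc([1.0, 2.1,  2.0, 3.9,  3.0, 6.1,  4.0, 8.2], 4, 2)
  pca = OpenTox::Algorithm::Transform::PCA.new(m)
  pca.data_transformed_matrix   # compressed representation
  pca.restore                   # original space, up to compression loss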
+
+ end
# Gauss kernel
# @return [Float]
@@ -538,24 +901,31 @@ module OpenTox
d = 1.0 - x.to_f
Math.exp(-(d*d)/(2*sigma*sigma))
end
-
- # Median of an array
- # @param [Array] Array with values
- # @return [Float] Median
- def self.median(array)
- return nil if array.empty?
- array.sort!
- m_pos = array.size / 2
- return array.size % 2 == 1 ? array[m_pos] : (array[m_pos-1] + array[m_pos])/2
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+ def self.isnull_or_singular?(array)
+ nr_zeroes = array.count(0)
+ return (nr_zeroes == array.size) || # remove non-occurring feature
+ (nr_zeroes == array.size-1) || # remove singular feature
+ (nr_zeroes == 0) # also remove feature present everywhere
end
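The three removal conditions are easiest to read off tiny arrays:

  OpenTox::Algorithm.isnull_or_singular?([0, 0, 0])   # => true   (non-occurring)
  OpenTox::Algorithm.isnull_or_singular?([0, 1, 0])   # => true   (singular)
  OpenTox::Algorithm.isnull_or_singular?([1, 2, 3])   # => true   (present everywhere)
  OpenTox::Algorithm.isnull_or_singular?([0, 1, 2])   # => false  (kept)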
- # Sum of an array for Numeric values
- # @param [Array] Array with values
- # @return [Integer] Sum of values
- def self.sum(array)
- array.inject{|s,x| s + x }
+ # Numeric value test
+ # @param[Object] value
+ # @return [Boolean] Whether value is a number
+ def self.numeric?(value)
+ true if Float(value) rescue false
end
+ # Variance zero test
+ # @param [Array] Array of numeric values to test.
+ # @return [Boolean] Whether the values have sample variance zero.
+ def self.zero_variance?(array)
+ return (array.to_scale.variance_sample == 0.0)
+ end
+
# Sum of an array for Arrays.
# @param [Array] Array with values
# @return [Integer] Sum of size of values
@@ -565,14 +935,13 @@ module OpenTox
return sum
end
-
# Minimum Frequency
# @param [Integer] per-mil value
# return [Integer] min-frequency
def self.min_frequency(training_dataset,per_mil)
- minfreq = per_mil*training_dataset.compounds.size/1000 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
minfreq = 2 unless minfreq > 2
- minfreq
+ Integer (minfreq)
end
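Two worked examples of the per-mil arithmetic (the floor of 2 applies to small training sets):

  # 8 per mil of 1000 compounds: 8 * 1000.0 / 1000.0 => 8.0, so minfreq = 8
  # 8 per mil of  100 compounds: 8 *  100.0 / 1000.0 => 0.8, floored to 2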
# Effect calculation for classification
@@ -582,7 +951,7 @@ module OpenTox
max=0
max_value=0
nr_o = self.sum_size(occurrences)
- nr_db = self.sum(db_instances)
+ nr_db = db_instances.to_scale.sum
occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
actual = o.size.to_f/nr_o
@@ -596,8 +965,20 @@ module OpenTox
}
max
end
-
-
+
+ # Returns support value of a fingerprint
+ # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits, :mode` are required
+ # return [Numeric] Support value
+ def self.p_sum_support(params)
+ p_sum = 0.0
+ params[:features].each{|f|
+ compound_hits = params[:compound_features_hits][f]
+ neighbor_hits = params[:training_compound_features_hits][f]
+ p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
+ }
+ p_sum
+ end
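A standalone sketch of the support sum with mode "min" and hypothetical hit counts (the original additionally passes each weight through Algorithm.gauss):

  features      = ["cc", "cN"]
  weights       = { "cc" => 0.9, "cN" => 0.5 }
  compound_hits = { "cc" => 3, "cN" => 1 }
  neighbor_hits = { "cc" => 2 }                 # no "cN" hits
  p_sum = 0.0
  features.each { |f|
    p_sum += weights[f] * [compound_hits[f], neighbor_hits[f]].compact.min
  }
  p_sum   # => 0.9 * 2 + 0.5 * 1 = 2.3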
+
end
end
diff --git a/lib/compound.rb b/lib/compound.rb
index d374b02..e7b4da0 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -164,6 +164,35 @@ module OpenTox
#smarts_array.collect { |s| s if match?(s)}.compact
end
+ # Matches an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value
+ # @example
+ #   compound = OpenTox::Compound.from_name("Benzene")
+ #   compound.match_hits(['cc','cN']) # returns {'cc' => <number of non-unique hits>}
+ # @param [Array] smarts_array Array with Smarts strings
+ # @return [Hash] Hash with matching smarts as key and number of non-unique hits as value
+ def match_hits(smarts_array)
+ # avoid recreation of OpenBabel objects
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_format('inchi')
+ obconversion.read_string(obmol,@inchi)
+ smarts_pattern = OpenBabel::OBSmartsPattern.new
+ smarts_hits = {}
+ #LOGGER.debug "dv ----------- obmol #{Compound.new(@inchi).to_smiles}"
+ smarts_array.collect do |smarts|
+ #LOGGER.debug "dv ----------- all smarts #{smarts}"
+ smarts_pattern.init(smarts)
+ if smarts_pattern.match(obmol)
+ hits = smarts_pattern.get_map_list
+ smarts_hits[smarts] = hits.size
+ end
+ end
+ #LOGGER.debug "dv ----------- smarts => hits #{smarts_hits}"
+ return smarts_hits
+ #smarts_array.collect { |s| s if match?(s)}.compact
+ end
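Usage sketch, assuming the opentox-ruby environment with its OpenBabel bindings; hit counts depend on the pattern and molecular symmetry:

  compound = OpenTox::Compound.from_name("Benzene")
  compound.match_hits(["cc", "cN"])   # => {"cc" => <non-unique hits>}; "cN" does not match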
+
+
# Get URI of compound image with highlighted fragments
#
# @param [Array] activating Array with activating Smarts strings
diff --git a/lib/dataset.rb b/lib/dataset.rb
index f13c0d3..5ebad0f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -102,6 +102,13 @@ module OpenTox
copy parser.load_uri(subjectid)
end
+ def load_sdf(sdf,subjectid=nil)
+ save(subjectid) unless @uri # get a uri for creating features
+ parser = Parser::Sdf.new
+ parser.dataset = self
+ parser.load_sdf(sdf)
+ end
+
# Load CSV string (format specification: http://toxcreate.org/help)
# - loads data_entries, compounds, features
# - sets metadata (warnings) for parser errors
@@ -230,6 +237,30 @@ module OpenTox
s.to_rdfxml
end
+ # Get SDF representation of compounds
+ # @return [String] SDF representation
+ def to_sdf
+ sum=""
+ @compounds.each{ |c|
+ sum << OpenTox::Compound.new(c).to_inchi
+ sum << OpenTox::Compound.new(c).to_sdf.sub(/\n\$\$\$\$/,'')
+ @data_entries[c].each{ |f,v|
+ sum << "> <\"#{f}\">\n"
+ sum << v.join(", ")
+ sum << "\n\n"
+ }
+ sum << "$$$$\n"
+ }
+ sum
+ end
+
+ # Get list of compound URIs
+ # @return [String] Newline-separated compound URIs
+ def to_urilist
+ @compounds.inject("") { |list, c|
+ list + OpenTox::Compound.new(c).uri + "\n"
+ }
+ end
+
# Get name (DC.title) of a feature
# @param [String] feature Feature URI
# @return [String] Feature title
diff --git a/lib/model.rb b/lib/model.rb
index 825f697..26c42a5 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -50,38 +50,49 @@ module OpenTox
@predicted_variable
end
+ def predicted_variables( subjectid )
+ load_predicted_variables( subjectid, false ) unless @predicted_variables
+ @predicted_variables
+ end
+
def predicted_confidence( subjectid )
load_predicted_variables( subjectid ) unless @predicted_confidence
@predicted_confidence
end
private
- def load_predicted_variables( subjectid=nil )
+ def load_predicted_variables( subjectid=nil, use_confidence=true )
load_metadata(subjectid) if @metadata==nil or @metadata.size==0 or (@metadata.size==1 && @metadata.values[0]==@uri)
if @metadata[OT.predictedVariables]
predictedVariables = @metadata[OT.predictedVariables]
if predictedVariables.is_a?(Array)
if (predictedVariables.size==1)
@predicted_variable = predictedVariables[0]
- elsif (predictedVariables.size==2)
+ elsif (predictedVariables.size>=2)
# PENDING identify confidence
- conf_index = -1
- predictedVariables.size.times do |i|
- f = OpenTox::Feature.find(predictedVariables[i])
- conf_index = i if f.metadata[DC.title]=~/(?i)confidence/
+ if use_confidence
+ conf_index = -1
+ predictedVariables.size.times do |i|
+ f = OpenTox::Feature.find(predictedVariables[i], subjectid)
+ conf_index = i if f.metadata[DC.title]=~/(?i)confidence/
+ end
+ raise "could not estimate predicted variable from model: '"+uri.to_s+
+ "', number of predicted-variables==2, but no confidence found" if conf_index==-1
+ end
+ if (predictedVariables.size==2) && use_confidence
+ @predicted_variable = predictedVariables[1-conf_index]
+ @predicted_confidence = predictedVariables[conf_index]
+ else
+ @predicted_variables = predictedVariables
end
- raise "could not estimate predicted variable from model: '"+uri.to_s+
- "', number of predicted-variables==2, but no confidence found" if conf_index==-1
- @predicted_variable = predictedVariables[1-conf_index]
- @predicted_confidence = predictedVariables[conf_index]
else
- raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables > 2"
+ raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables == 0"
end
else
raise "could not estimate predicted variable from model: '"+uri.to_s+"', predicted-variables is no array"
end
end
- raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless @predicted_variable
+ raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless (@predicted_variable || @predicted_variables)
end
end
@@ -91,7 +102,7 @@ module OpenTox
include Algorithm
include Model
- attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :balanced
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev
def initialize(uri=nil)
@@ -113,10 +124,12 @@ module OpenTox
@feature_calculation_algorithm = "Substructure.match"
@similarity_algorithm = "Similarity.tanimoto"
@prediction_algorithm = "Neighbors.weighted_majority_vote"
-
+
+ @nr_hits = false
@min_sim = 0.3
@prop_kernel = false
- @balanced = false
+ @transform = { "class" => "NOP" }
+ @conf_stdev = false
end
@@ -168,6 +181,7 @@ module OpenTox
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [OpenTox::Dataset] Dataset with predictions
def predict_dataset(dataset_uri, subjectid=nil, waiting_task=nil)
+
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@prediction_dataset.add_metadata({
OT.hasSource => @uri,
@@ -212,90 +226,33 @@ module OpenTox
unless database_activity(subjectid) # adds database activity to @prediction_dataset
- if @balanced && OpenTox::Feature.find(metadata[OT.dependentVariables]).feature_type == "classification"
- # AM: Balancing, see http://www.maunz.de/wordpress/opentox/2011/balanced-lazar
- l = Array.new # larger
- s = Array.new # smaller fraction
-
- raise "no fingerprints in model" if @fingerprints.size==0
-
- @fingerprints.each do |training_compound,training_features|
- @activities[training_compound].each do |act|
- case act.to_s
- when "0"
- l << training_compound
- when "1"
- s << training_compound
- else
- LOGGER.warn "BLAZAR: Activity #{act.to_s} should not be reached (supports only two classes)."
- end
- end
- end
- if s.size > l.size then
- l,s = s,l # happy swapping
- LOGGER.info "BLAZAR: |s|=#{s.size}, |l|=#{l.size}."
- end
- # determine ratio
- modulo = l.size.divmod(s.size)# modulo[0]=ratio, modulo[1]=rest
- LOGGER.info "BLAZAR: Balance: #{modulo[0]}, rest #{modulo[1]}."
-
- # AM: Balanced predictions
- addon = (modulo[1].to_f/modulo[0]).ceil # what will be added in each round
- slack = (addon!=0 ? modulo[1].divmod(addon)[1] : 0) # what remains for the last round
- position = 0
- predictions = Array.new
-
- prediction_best=nil
- neighbors_best=nil
-
- begin
- for i in 1..modulo[0] do
- (i == modulo[0]) && (slack>0) ? lr_size = s.size + slack : lr_size = s.size + addon # determine fraction
- LOGGER.info "BLAZAR: Neighbors round #{i}: #{position} + #{lr_size}."
- neighbors_balanced(s, l, position, lr_size) # get ratio fraction of larger part
- if @prop_kernel && ( @prediction_algorithm.include?("svm") || @prediction_algorithm.include?("local_mlr_prop") )
- props = get_props
- else
- props = nil
- end
- prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values, :value_map => @value_map}, props)")
- if prediction_best.nil? || prediction[:confidence].abs > prediction_best[:confidence].abs
- prediction_best=prediction
- neighbors_best=@neighbors
- end
- position = position + lr_size
- end
- rescue Exception => e
- LOGGER.error "BLAZAR failed in prediction: "+e.class.to_s+": "+e.message
- end
-
- prediction=prediction_best
- @neighbors=neighbors_best
- ### END AM balanced predictions
+ neighbors
+ prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors,
+ :compound => @compound,
+ :features => @features,
+ :p_values => @p_values,
+ :fingerprints => @fingerprints,
+ :similarity_algorithm => @similarity_algorithm,
+ :prop_kernel => @prop_kernel,
+ :value_map => @value_map,
+ :nr_hits => @nr_hits,
+ :conf_stdev => @conf_stdev,
+ :transform => @transform } ) ")
- else # AM: no balancing or regression
- LOGGER.info "LAZAR: Unbalanced."
- neighbors
- if @prop_kernel && ( @prediction_algorithm.include?("svm") || @prediction_algorithm.include?("local_mlr_prop") )
- props = get_props
- else
- props = nil
- end
- prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values, :value_map => @value_map}, props)")
- end
-
value_feature_uri = File.join( @uri, "predicted", "value")
confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables]
@prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables]
- if OpenTox::Feature.find(metadata[OT.dependentVariables]).feature_type == "classification"
+ if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification"
@prediction_dataset.add @compound.uri, value_feature_uri, @value_map[prediction[:prediction]]
else
@prediction_dataset.add @compound.uri, value_feature_uri, prediction[:prediction]
end
@prediction_dataset.add @compound.uri, confidence_feature_uri, prediction[:confidence]
+ @prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title]
+ @prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence"
if verbose
if @feature_calculation_algorithm == "Substructure.match"
@@ -356,56 +313,32 @@ module OpenTox
@prediction_dataset
end
- # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
- # Same for the vector describing the query compound
- def get_props
- matrix = Array.new
- begin
- @neighbors.each do |n|
- n = n[:compound]
- row = []
- @features.each do |f|
- if ! @fingerprints[n].nil?
- row << (@fingerprints[n].include?(f) ? 0.0 : @p_values[f])
- else
- row << 0.0
- end
- end
- matrix << row
- end
- row = []
- @features.each do |f|
- row << (@compound.match([f]).size == 0 ? 0.0 : @p_values[f])
- end
- rescue Exception => e
- LOGGER.debug "get_props failed with '" + $! + "'"
- end
- [ matrix, row ]
- end
-
- # Find neighbors and store them as object variable, access only a subset of compounds for that.
- def neighbors_balanced(s, l, start, offset)
- @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
- @neighbors = []
- [ l[start, offset ] , s ].flatten.each do |training_compound| # AM: access only a balanced subset
- training_features = @fingerprints[training_compound]
- add_neighbor training_features, training_compound
- end
-
- end
+
# Find neighbors and store them as object variable, access all compounds for that.
def neighbors
@compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
@neighbors = []
- @fingerprints.each do |training_compound,training_features| # AM: access all compounds
- add_neighbor training_features, training_compound
+ @fingerprints.keys.each do |training_compound| # AM: access all compounds
+ add_neighbor @fingerprints[training_compound].keys, training_compound
end
end
# Adds a neighbor to @neighbors if it passes the similarity threshold.
def add_neighbor(training_features, training_compound)
- sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)")
+ compound_features_hits = {}
+ training_compound_features_hits = {}
+ if @nr_hits
+ compound_features_hits = @compound.match_hits(@compound_features)
+ training_compound_features_hits = @fingerprints[training_compound]
+ #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class} #{training_compound_features_hits}"
+ end
+ params = {}
+ params[:nr_hits] = @nr_hits
+ params[:compound_features_hits] = compound_features_hits
+ params[:training_compound_features_hits] = training_compound_features_hits
+
+ sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)")
if sim > @min_sim
@activities[training_compound].each do |act|
@neighbors << {
diff --git a/lib/parser.rb b/lib/parser.rb
index 07bee67..d0975af 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -86,7 +86,11 @@ module OpenTox
# @param [String] rdf
# @param [String] type of the info (e.g. OT.Task, OT.ErrorReport) needed to get the subject-uri
# @return [Owl] with uri and metadata set
- def self.from_rdf( rdf, type )
+ def self.from_rdf( rdf, type, allow_multiple = false )
+
+ uris = Array.new
+ owls = Array.new
+
# write to file and read convert with rapper into tripples
file = Tempfile.new("ot-rdfxml")
file.puts rdf
@@ -99,20 +103,27 @@ module OpenTox
triples.each_line do |line|
triple = line.to_triple
if triple[1] == RDF['type'] and triple[2]==type
- raise "uri already set, two uris found with type: "+type.to_s if uri
+ if !allow_multiple
+ raise "uri already set, two uris found with type: "+type.to_s if uri
+ end
uri = triple[0]
+ uris << uri
end
end
File.delete(file.path)
+
# load metadata
- metadata = {}
- triples.each_line do |line|
- triple = line.to_triple
- metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type']
- end
- owl = Owl::Generic.new(uri)
- owl.metadata = metadata
- owl
+ uris.each { |uri|
+ metadata = {}
+ triples.each_line do |line|
+ triple = line.to_triple
+ metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type']
+ end
+ owl = Owl::Generic.new(uri)
+ owl.metadata = metadata
+ owls << owl
+ }
+ allow_multiple ? owls : owls[0]
end
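With allow_multiple set, an array of Owl objects is returned instead of a single one; a usage sketch (rdf_string stands for a hypothetical RDF/XML payload):

  datasets = OpenTox::Parser::Owl.from_rdf(rdf_string, OT.Dataset, true)
  datasets.each { |d| puts "#{d.uri} => #{d.metadata[OT.hasSource]}" }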
# Generic parser for all OpenTox classes
@@ -350,7 +361,6 @@ module OpenTox
@dataset
end
-
private
def warnings
@@ -437,12 +447,8 @@ module OpenTox
end
end
- def numeric?(value)
- true if Float(value) rescue false
- end
-
def feature_type(value)
- if numeric? value
+ if OpenTox::Algorithm::numeric? value
return OT.NumericFeature
else
return OT.NominalFeature
@@ -454,5 +460,159 @@ module OpenTox
end
end
+
+ class Table
+
+ attr_accessor :data, :features, :compounds
+
+ def initialize
+ @data = {}
+ @activity_errors = []
+ end
+
+ def feature_values(feature)
+ @data.collect{|c, row| row[feature]}.uniq.compact
+ end
+
+ def feature_types(feature)
+ @data.collect{|c, row| feature_type(row[feature])}.uniq.compact
+ end
+
+ def features
+ @data.collect{|c,row| row.keys}.flatten.uniq
+ end
+
+ def clean_features
+ ignored_features = []
+ features.each do |feature|
+ if feature_values(feature).size > 5
+ if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
+ # REGRESSION
+ elsif feature_types(feature).include? OT.NumericFeature
+ @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
+ @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
+ else
+ @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+ ignored_features << feature
+ next
+ end
+ elsif feature_values(feature).size <= 1
+ @activity_errors << "Feature #{feature} ignored (less than 2 feature values)."
+ ignored_features << feature
+ else
+ # CLASSIFICATION
+ end
+ end
+ ignored_features.each do |feature|
+ @data.each{ |c,row| row.delete feature }
+ end
+ @activity_errors
+ end
+
+ def add_to_dataset(dataset)
+ features.each do |feature_name|
+ feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name))
+ dataset.add_feature(feature_uri,{DC.title => feature_name})
+ end
+
+ @data.each do |compound,row|
+ unless row.empty?
+ row.each do |feature,value|
+ if OpenTox::Algorithm::numeric?(value)
+ value = value.to_f
+ elsif value.nil? or value.empty?
+ value = nil
+ else
+ value = value.to_s
+ end
+ feature_uri = File.join(dataset.uri,"feature",URI.encode(feature))
+ dataset.add(compound, feature_uri, value)
+ #dataset.features[feature_uri][RDF.type] = feature_types(feature)
+ #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
+ if feature_types(feature).include? OT.NumericFeature
+ dataset.features[feature_uri][RDF.type] = [OT.NumericFeature]
+ else
+ dataset.features[feature_uri][RDF.type] = [OT.NominalFeature]
+ dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
+ end
+ end
+ end
+ end
+ end
+
+ private
+
+ def feature_type(value)
+ if OpenTox::Algorithm::numeric? value
+ return OT.NumericFeature
+ else
+ return OT.NominalFeature
+ end
+ end
+ end
+
+ # quick hack to enable sdf import via the csv table machinery
+ # should be refactored
+ class Sdf
+
+ attr_accessor :dataset
+
+ def initialize
+ @data = {}
+
+ @compound_errors = []
+ @activity_errors = []
+ @duplicates = {}
+ end
+
+ def load_sdf(sdf)
+
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_and_out_formats "sdf", "inchi"
+
+ table = Table.new
+
+ properties = []
+ sdf.each_line { |l| properties << l.to_s if l.match(/</) }
+ properties.uniq!
+ properties.sort!
+ properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
+
+ rec = 0
+ sdf.split(/\$\$\$\$\r*\n/).each do |s|
+ rec += 1
+ obconversion.read_string obmol, s
+ begin
+ inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ @duplicates[inchi] = [] unless @duplicates[inchi]
+ @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
+ compound = Compound.from_inchi inchi
+ rescue
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
+ next
+ end
+ row = {}
+ obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
+ table.data[compound.uri] = row
+ end
+
+ # find and remove ignored features
+ @activity_errors = table.clean_features
+ table.add_to_dataset @dataset
+
+ warnings = ''
+ warnings += "<p>Structures that could not be converted to InChI (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+ warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+ duplicate_warnings = ''
+ @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
+ warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+
+ @dataset.metadata[OT.Warnings] = warnings
+ @dataset
+
+ end
+
+ end
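A usage sketch for the SDF import path via the Dataset#load_sdf method added above in lib/dataset.rb (hypothetical file name; a dataset service must be reachable, since the dataset is saved to obtain a URI):

  dataset = OpenTox::Dataset.new
  dataset.load_sdf(File.read("structures.sdf"))
  puts dataset.metadata[OT.Warnings]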
end
end