author     rautenberg <rautenberg@in-silico.ch>  2012-03-13 15:32:57 +0100
committer  rautenberg <rautenberg@in-silico.ch>  2012-03-13 15:32:57 +0100
commit     6b064515f11623e0209f265b32be6889e28def52 (patch)
tree       66e9182b001a94c96270c153a659b6b8eb0055c2
parent     1687a218b1593478bae1ab43a3eb8e5596def684 (diff)
parent     4f14262609d58bf856675ae01195dd2c5f70b97b (diff)
pre v3.1.0
-rw-r--r--  ChangeLog                     8
-rw-r--r--  Rakefile                      5
-rw-r--r--  lib/algorithm.rb            902
-rw-r--r--  lib/authorization.rb         18
-rw-r--r--  lib/compound.rb              58
-rw-r--r--  lib/dataset.rb               52
-rw-r--r--  lib/environment.rb            7
-rw-r--r--  lib/model.rb                109
-rw-r--r--  lib/opentox-ruby.rb           2
-rw-r--r--  lib/parser.rb               172
-rw-r--r--  lib/r-util.rb               354
-rw-r--r--  lib/rest_client_wrapper.rb    2
-rw-r--r--  lib/serializer.rb            77
-rw-r--r--  lib/stratification.R        201
-rw-r--r--  lib/task.rb                   6
-rw-r--r--  lib/transform.rb            520
-rw-r--r--  lib/utils.rb                372
-rw-r--r--  lib/validation.rb            58
-rw-r--r--  opentox-ruby.gemspec          4
19 files changed, 2058 insertions, 869 deletions
diff --git a/ChangeLog b/ChangeLog
index de9e01b..5872d56 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+v3.1.0 2012-02-24
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV upload and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cosine similarity as algorithm,
+ gauss() removed
+
v3.0.1 2011-10-19
* feature: model registration to ontology service
* ontology lib gets endpoints from ontology service
diff --git a/Rakefile b/Rakefile
index 952affe..dddea1b 100644
--- a/Rakefile
+++ b/Rakefile
@@ -16,7 +16,7 @@ begin
gem.add_dependency "sinatra-respond_to", "=0.7.0"
gem.add_dependency "sinatra-static-assets", "=0.5.0"
gem.add_dependency "rest-client", "=1.6.1"
- gem.add_dependency "rack", "=1.3.1"
+ gem.add_dependency "rack", "=1.3.5"
gem.add_dependency "rack-contrib", "=1.1.0"
gem.add_dependency "rack-flash", "=0.1.1"
gem.add_dependency "nokogiri", "=1.4.4"
@@ -42,10 +42,9 @@ begin
gem.add_dependency "dm-migrations", "=1.1.0"
gem.add_dependency "dm-validations", "=1.1.0"
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.5.0"
+ gem.add_dependency "ruby-plot", "=0.6.0"
gem.add_dependency "gsl", "=1.14.7"
gem.add_dependency "statsample", "=1.1.0"
- #gem.add_dependency "statsample-optimization", "=2.1.0"
gem.add_development_dependency 'jeweler'
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index cf88bab..c026c56 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -5,6 +5,8 @@ R = nil
require "rinruby"
require "statsample"
require 'uri'
+require 'transform.rb'
+require 'utils.rb'
module OpenTox
@@ -13,7 +15,7 @@ module OpenTox
include OpenTox
- # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters
+ # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
# @param [optional,Hash] params Algorithm parameters
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [String] URI of new resource (dataset, model, ...)
@@ -21,7 +23,7 @@ module OpenTox
LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
end
-
+
# Get OWL-DL representation in RDF/XML format
# @return [application/rdf+xml] RDF/XML representation
def to_rdfxml
@@ -33,7 +35,7 @@ module OpenTox
# Generic Algorithm class, should work with all OpenTox webservices
class Generic
include Algorithm
-
+
# Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
# @param [String] uri Algorithm URI
# @return [OpenTox::Algorithm::Generic] Algorithm instance
@@ -44,14 +46,14 @@ module OpenTox
raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
alg
end
-
+
end
# Fminer algorithms (https://github.com/amaunz/fminer2)
class Fminer
include Algorithm
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
-
+
def check_params(params,per_mil,subjectid=nil)
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
@@ -81,7 +83,7 @@ module OpenTox
LOGGER.warn "Cannot find smiles for #{compound.to_s}."
next
end
-
+
value_map=params[:value_map] unless params[:value_map].nil?
entry.each do |feature,values|
if feature == @prediction_feature.uri
@@ -90,7 +92,7 @@ module OpenTox
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
+ activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
activity= value.to_f
@@ -115,23 +117,23 @@ module OpenTox
end
- # Backbone Refinement Class mining (http://bbrc.maunz.de/)
- class BBRC < Fminer
- # Initialize bbrc algorithm
- def initialize(subjectid=nil)
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
- load_metadata(subjectid)
- end
+ # Backbone Refinement Class mining (http://bbrc.maunz.de/)
+ class BBRC < Fminer
+ # Initialize bbrc algorithm
+ def initialize(subjectid=nil)
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
+ load_metadata(subjectid)
end
+ end
- # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
- class LAST < Fminer
- # Initialize last algorithm
- def initialize(subjectid=nil)
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
- load_metadata(subjectid)
- end
+ # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
+ class LAST < Fminer
+ # Initialize last algorithm
+ def initialize(subjectid=nil)
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
+ load_metadata(subjectid)
end
+ end
# Create lazar prediction model
@@ -144,72 +146,6 @@ module OpenTox
end
end
- # Utility methods without dedicated webservices
-
- # Similarity calculations
- module Similarity
- include Algorithm
-
- # Tanimoto similarity
- # @param [Array] features_a Features of first compound
- # @param [Array] features_b Features of second compound
- # @param [optional, Hash] weights Weights for all features
- # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
- # @return [Float] (Weighted) tanimoto similarity
- def self.tanimoto(features_a,features_b,weights=nil,params=nil)
- common_features = features_a & features_b
- all_features = (features_a + features_b).uniq
- #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
- if common_features.size > 0
- if weights
- #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
- if !params.nil? && params[:nr_hits]
- params[:weights] = weights
- params[:mode] = "min"
- params[:features] = common_features
- common_p_sum = Algorithm.p_sum_support(params)
- params[:mode] = "max"
- params[:features] = all_features
- all_p_sum = Algorithm.p_sum_support(params)
- else
- common_p_sum = 0.0
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
- all_p_sum = 0.0
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
- end
- #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
- common_p_sum/all_p_sum
- else
- #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
- common_features.size.to_f/all_features.size.to_f
- end
- else
- 0.0
- end
- end
-
- # Euclidean similarity
- # @param [Hash] properties_a Properties of first compound
- # @param [Hash] properties_b Properties of second compound
- # @param [optional, Hash] weights Weights for all properties
- # @return [Float] (Weighted) euclidean similarity
- def self.euclidean(properties_a,properties_b,weights=nil)
- common_properties = properties_a.keys & properties_b.keys
- if common_properties.size > 1
- dist_sum = 0
- common_properties.each do |p|
- if weights
- dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
- else
- dist_sum += (properties_a[p] - properties_b[p])**2
- end
- end
- 1/(1+Math.sqrt(dist_sum))
- else
- 0.0
- end
- end
- end
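For reference, the unweighted branch of the Tanimoto similarity computed by the removed module reduces to the ratio of shared to combined features. A minimal standalone sketch (illustrative, not the library API):

# Hedged sketch of unweighted Tanimoto: |A & B| / |A u B| over feature arrays.
def tanimoto(features_a, features_b)
  common = features_a & features_b
  all    = (features_a + features_b).uniq
  common.empty? ? 0.0 : common.size.to_f / all.size
end

tanimoto(%w[cc cN], %w[cc cO])  # => 0.333... (1 shared of 3 total features)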
# Structural Graph Clustering by TU Munich
# Finds clusters similar to a query structure in a given training dataset
@@ -226,7 +162,7 @@ module OpenTox
raise "Invalid URI."
end
@training_dataset_uri = training_dataset_uri
- if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+ if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
raise "Training threshold out of bounds."
end
@training_threshold = training_threshold.to_f
@@ -259,7 +195,7 @@ module OpenTox
# @params[Float] Similarity threshold for query to clusters (optional)
def get_clusters query_compound_uri, query_threshold = 0.5
- if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+ if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
raise "Query threshold out of bounds."
end
@query_threshold = query_threshold.to_f
@@ -285,7 +221,7 @@ module OpenTox
metadata[DC.title][pattern]=""
feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
}
-
+
# Integrity check
unless cluster_query_dataset.compounds.size == 1
raise "Number of predicted compounds is != 1."
@@ -295,11 +231,11 @@ module OpenTox
query_compound_uri = cluster_query_dataset.compounds[0]
@target_clusters_array = Array.new
cluster_query_dataset.features.keys.each { |cluster_membership_feature|
-
+
# Getting dataset URI for cluster
target_cluster = feature_clusterid_map[cluster_membership_feature]
dataset = @clusterid_dataset_map[target_cluster]
-
+
# Finally look up presence
data_entry = cluster_query_dataset.data_entries[query_compound_uri]
present = data_entry[cluster_membership_feature][0]
@@ -311,85 +247,13 @@ module OpenTox
end
- module Neighbors
-
- # Local multi-linear regression (MLR) prediction from neighbors.
- # Uses propositionalized setting.
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
- # @return [Numeric] A prediction value.
- def self.local_mlr_prop(params)
-
- confidence=0.0
- prediction=nil
-
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
- sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
- LOGGER.debug "Local MLR (Propositionalization / GSL)."
- prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
- prediction = transformer.values[0]
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
- confidence = nil if prediction.nil?
- end
- {:prediction => prediction, :confidence => confidence}
-
- end
-
- # Multi-linear regression weighted by similarity.
- # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
- # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
- # @return [Numeric] A prediction value.
- def self.mlr(params)
-
- # GSL matrix operations:
- # to_a : row-wise conversion to nested array
- #
- # Statsample operations (build on GSL):
- # to_scale: convert into Statsample format
-
- begin
- n_prop = params[:n_prop].collect { |v| v }
- q_prop = params[:q_prop].collect { |v| v }
- n_prop << q_prop # attach q_prop
- nr_cases, nr_features = get_sizes n_prop
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
-
- # Principal Components Analysis
- LOGGER.debug "PCA..."
- pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
- data_matrix = pca.data_transformed_matrix
-
- # Attach intercept column to data
- intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
- data_matrix = data_matrix.horzcat(intercept)
- (0..data_matrix.size2-2).each { |i|
- autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
- data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
- }
- # Detach query instance
- n_prop = data_matrix.to_a
- q_prop = n_prop.pop
- nr_cases, nr_features = get_sizes n_prop
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
- # model + support vectors
- LOGGER.debug "Creating MLR model ..."
- c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
- GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- end
+ module Neighbors
- end
# Classification with majority vote from neighbors weighted by similarity
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Hash] params Keys `:acts, :sims, :value_map` are required
# @return [Numeric] A prediction value.
def self.weighted_majority_vote(params)
@@ -398,12 +262,13 @@ module OpenTox
confidence = 0.0
prediction = nil
- params[:neighbors].each do |neighbor|
- neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
- neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+ LOGGER.debug "Weighted Majority Vote Classification."
+ params[:acts].each_index do |idx|
+ neighbor_weight = params[:sims][1][idx]
+ neighbor_contribution += params[:acts][idx] * neighbor_weight
if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
- case neighbor[:activity]
+ case params[:acts][idx]
when 1
confidence_sum -= neighbor_weight
when 2
@@ -413,294 +278,257 @@ module OpenTox
confidence_sum += neighbor_weight
end
end
-
if params[:value_map].size == 2
if confidence_sum >= 0.0
- prediction = 2 unless params[:neighbors].size==0
+ prediction = 2 unless params[:acts].size==0
elsif confidence_sum < 0.0
- prediction = 1 unless params[:neighbors].size==0
+ prediction = 1 unless params[:acts].size==0
end
else
- prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
end
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
- confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+ confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
return {:prediction => prediction, :confidence => confidence.abs}
end
+
+
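The reworked vote consumes plain arrays (:acts holding integer classes 1..n, :sims holding the weight vector at index 1) instead of neighbor hashes. A minimal standalone sketch of the same weighting scheme, with hypothetical names:

# Hedged sketch: similarity-weighted majority vote over classes 1..n.
# Illustration only, not the service entry point above.
def majority_vote(acts, sims)
  return nil if acts.empty?
  contribution = 0.0
  weight_sum   = 0.0
  acts.each_index do |i|
    contribution += acts[i] * sims[i]
    weight_sum   += sims[i]
  end
  (contribution / weight_sum).round
end

majority_vote([1, 2, 2], [0.9, 0.8, 0.7])  # => 2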
# Local support vector regression from neighbors
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
# @return [Numeric] A prediction value.
def self.local_svm_regression(params)
- confidence = 0.0
- prediction = nil
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect{ |n| n[:activity].to_f }
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
- prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
- prediction = transformer.values[0]
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
- confidence = nil if prediction.nil?
+ begin
+ confidence = 0.0
+ prediction = nil
+
+ LOGGER.debug "Local SVM."
+ if params[:acts].size>0
+ if params[:props]
+ n_prop = params[:props][0].collect
+ q_prop = params[:props][1].collect
+ props = [ n_prop, q_prop ]
+ end
+ acts = params[:acts].collect
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+ confidence = 0.0 if prediction.nil?
+ end
+ {:prediction => prediction, :confidence => confidence}
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- {:prediction => prediction, :confidence => confidence}
-
+
end
- # Local support vector classification from neighbors
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+
+ # Local support vector classification from neighbors
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
# @return [Numeric] A prediction value.
def self.local_svm_classification(params)
- confidence = 0.0
- prediction = nil
- if params[:neighbors].size>0
- props = params[:prop_kernel] ? get_props(params) : nil
- acts = params[:neighbors].collect { |n| act = n[:activity] }
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
- prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- params[:conf_stdev] = false if params[:conf_stdev].nil?
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ begin
+ confidence = 0.0
+ prediction = nil
+
+ LOGGER.debug "Local SVM."
+ if params[:acts].size>0
+ if params[:props]
+ n_prop = params[:props][0].collect
+ q_prop = params[:props][1].collect
+ props = [ n_prop, q_prop ]
+ end
+ acts = params[:acts].collect
+ acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
+ confidence = 0.0 if prediction.nil?
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
+ end
+ {:prediction => prediction, :confidence => confidence}
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- {:prediction => prediction, :confidence => confidence}
-
+
end
+
# Local support vector prediction from neighbors.
- # Uses pre-defined Kernel Matrix.
+ # Uses propositionalized setting.
# Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ two-nested-Arrays_for_n, Array_for_q ]
# @param [Array] acts, activities for neighbors.
- # @param [Array] sims, similarities for neighbors.
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @param [Float] min_train_performance, parameter to control censoring
# @return [Numeric] A prediction value.
- def self.local_svm(acts, sims, type, params)
- LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
- neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
- gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
+ def self.local_svm_prop(props, acts, min_train_performance)
+
+ LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+ n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+ q_prop = props[1] # is an Array.
prediction = nil
if Algorithm::zero_variance? acts
prediction = acts[0]
else
- # gram matrix
- (0..(neighbor_matches.length-1)).each do |i|
- neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
- gram_matrix[i] = [] unless gram_matrix[i]
- # upper triangle
- ((i+1)..(neighbor_matches.length-1)).each do |j|
- neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
- sim_params = {}
- if params[:nr_hits]
- sim_params[:nr_hits] = true
- sim_params[:compound_features_hits] = neighbor_i_hits
- sim_params[:training_compound_features_hits] = neighbor_j_hits
- end
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
- gram_matrix[i][j] = Algorithm.gauss(sim)
- gram_matrix[j] = [] unless gram_matrix[j]
- gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
- end
- gram_matrix[i][i] = 1.0
- end
-
-
#LOGGER.debug gram_matrix.to_yaml
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
- LOGGER.debug "Setting R data ..."
- # set data
- @r.gram_matrix = gram_matrix.flatten
- @r.n = neighbor_matches.size
- @r.y = acts
- @r.sims = sims
-
+ @r.eval "set.seed(1)"
+ @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
+ @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R package "multicore"
+ @r.eval "registerDoMC()" # switch on parallel processing
begin
- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-as.vector(y)"
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
- @r.eval "sims<-as.vector(sims)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
- @r.eval "sv<-as.vector(SVindex(model))"
- @r.eval "sims<-sims[sv]"
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
- LOGGER.debug "Predicting ..."
- if type == "nu-svr"
- @r.eval "p<-predict(model,sims)[1,1]"
- elsif type == "C-bsvc"
- @r.eval "p<-predict(model,sims)"
- end
- if type == "nu-svr"
- prediction = @r.p
- elsif type == "C-bsvc"
- #prediction = (@r.p.to_f == 1.0 ? true : false)
- prediction = @r.p
- end
- @r.quit # free R
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
-
- end
- prediction
- end
-
- # Local support vector prediction from neighbors.
- # Uses propositionalized setting.
- # Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @param [Array] acts, activities for neighbors.
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
- # @return [Numeric] A prediction value.
- def self.local_svm_prop(props, acts, type)
-
- LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
- q_prop = props[1] # is an Array.
- prediction = nil
- if Algorithm::zero_variance? acts
- prediction = acts[0]
- else
- #LOGGER.debug gram_matrix.to_yaml
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
- LOGGER.debug "Setting R data ..."
# set data
+ LOGGER.debug "Setting R data ..."
@r.n_prop = n_prop.flatten
@r.n_prop_x_size = n_prop.size
@r.n_prop_y_size = n_prop[0].size
@r.y = acts
@r.q_prop = q_prop
+ #@r.eval "y = matrix(y)"
+ @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+ @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
- begin
- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-matrix(y)"
- @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
- @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
- LOGGER.debug "Predicting ..."
- if type == "nu-svr"
- @r.eval "p<-predict(model,q_prop)[1,1]"
- elsif type == "C-bsvc"
- @r.eval "p<-predict(model,q_prop)"
- end
- if type == "nu-svr"
- prediction = @r.p
- elsif type == "C-bsvc"
- #prediction = (@r.p.to_f == 1.0 ? true : false)
- prediction = @r.p
- end
- @r.quit # free R
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
- prediction
- end
+ # prepare data
+ LOGGER.debug "Preparing R data ..."
+ @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+
+ @r.eval <<-EOR
+ rem = nearZeroVar(prop_matrix)
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ rem = findCorrelation(cor(prop_matrix))
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ EOR
- # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
- # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
- # @return[Float] Confidence
- def self.get_confidence(params)
- if params[:conf_stdev]
- sim_median = params[:sims].to_scale.median
- if sim_median.nil?
- confidence = nil
- else
- standard_deviation = params[:acts].to_scale.standard_deviation_sample
- confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
- if confidence.nan?
- confidence = nil
- end
- end
- else
- conf = params[:sims].inject{|sum,x| sum + x }
- confidence = conf/params[:neighbors].size
- end
- LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
- return confidence
- end
+ # model + support vectors
+ LOGGER.debug "Creating R SVM model ..."
+ @r.eval <<-EOR
+ model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ EOR
- # Get X and Y size of a nested Array (Matrix)
- def self.get_sizes(matrix)
- begin
- nr_cases = matrix.size
- nr_features = matrix[0].size
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
- [ nr_cases, nr_features ]
- end
- # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
- # Same for the vector describing the query compound
- # @param[Array] neighbors.
- # @param[OpenTox::Compound] query compound.
- # @param[Array] Dataset Features.
- # @param[Array] Fingerprints of neighbors.
- # @param[Float] p-values of Features.
- def self.get_props (params)
- matrix = Array.new
- begin
- params[:neighbors].each do |n|
- n = n[:compound]
- row = []
- params[:features].each do |f|
- if ! params[:fingerprints][n].nil?
- row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
- else
- row << 0.0
- end
- end
- matrix << row
- end
- row = []
- params[:features].each do |f|
- if params[:nr_hits]
- compound_feature_hits = params[:compound].match_hits([f])
- row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
- else
- row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
- end
+ # prediction
+ LOGGER.debug "Predicting ..."
+ @r.eval "p = predict(model,q_prop)"
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
+ prediction = @r.p
+
+ # censoring
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
- rescue Exception => e
- LOGGER.debug "get_props failed with '" + $! + "'"
+ @r.quit # free R
end
- [ matrix, row ]
+ prediction
end
end
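A note on the censoring step above: the caret-derived performance estimate (accuracy for classification, R-squared for regression) gates the prediction against :min_train_performance. A hedged sketch of that guard in isolation:

# Hedged sketch: drop a prediction when internal validation performance
# is NaN or below the caller-supplied threshold.
def censor(prediction, performance, min_train_performance)
  return nil if performance.nan? || performance < min_train_performance
  prediction
end

censor(4.2, 0.35, 0.5)  # => nil (local model judged too weak)
censor(4.2, 0.71, 0.5)  # => 4.2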
+ module FeatureSelection
+ include Algorithm
+ # Recursive Feature Elimination using caret
+ # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
+ # @return [String] feature dataset CSV file composed of selected features.
+ def self.rfe(params)
+ @r=RinRuby.new(false,false)
+ @r.ds_csv_file = params[:ds_csv_file].to_s
+ @r.prediction_feature = params[:prediction_feature].to_s
+ @r.fds_csv_file = params[:fds_csv_file].to_s
+ @r.del_missing = params[:del_missing] == true ? 1 : 0
+ r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
+ @r.f_fds_r = r_result_file.to_s
+
+ # need packs 'randomForest', 'RANN'
+ @r.eval <<-EOR
+ set.seed(1)
+ suppressPackageStartupMessages(library('caret'))
+ suppressPackageStartupMessages(library('randomForest'))
+ suppressPackageStartupMessages(library('RANN'))
+ suppressPackageStartupMessages(library('doMC'))
+ registerDoMC()
+
+ acts = read.csv(ds_csv_file, check.names=F)
+ feats = read.csv(fds_csv_file, check.names=F)
+ ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
+
+ features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
+ y = ds[,which(names(ds) == prediction_feature)]
+
+ # assumes a data matrix 'features' and a vector 'y' of target values
+ row.names(features)=NULL
+
+ pp = NULL
+ if (del_missing) {
+ # needed if rows should be removed
+ na_ids = apply(features,1,function(x)any(is.na(x)))
+ features = features[!na_ids,]
+ y = y[!na_ids]
+ pp = preProcess(features, method=c("scale", "center"))
+ } else {
+ # Use imputation if NA's random (only then!)
+ pp = preProcess(features, method=c("scale", "center", "knnImpute"))
+ }
+ features = predict(pp, features)
+
+ # determine subsets
+ subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ subsets = c(2,3,4,5,7,10,subsets)
+ subsets = unique(sort(round(subsets)))
+ subsets = subsets[subsets<=dim(features)[2]]
+ subsets = subsets[subsets>1]
+
+ # Recursive feature elimination
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+
+ # read existing dataset and select most useful features
+ csv=feats[,c("SMILES", rfProfile$optVariables)]
+ write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
+ EOR
+ r_result_file
+ end
+ end
+
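A hedged usage sketch for the new rfe entry point; file names and the prediction feature below are illustrative:

selected_csv = OpenTox::Algorithm::FeatureSelection.rfe(
  :ds_csv_file        => "activities.csv",       # SMILES + activity column
  :fds_csv_file       => "rfe_descriptors.csv",  # SMILES + descriptor columns
  :prediction_feature => "LC50",
  :del_missing        => false                   # impute NAs instead of dropping rows
)
# => "rfe_R_descriptors.csv", the reduced feature table written by R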
module Substructure
include Algorithm
# Substructure matching
- # @param [OpenTox::Compound] compound Compound
- # @param [Array] features Array with Smarts strings
+ # @param [Hash] required keys: compound, features
# @return [Array] Array with matching Smarts
- def self.match(compound,features)
- compound.match(features)
+ def self.match(params)
+ params[:compound].match(params[:features])
end
+
+ # Substructure matching with number of non-unique hits
+ # @param [Hash] required keys: compound, features
+ # @return [Hash] Hash with matching Smarts and number of hits
+ def self.match_hits(params)
+ params[:compound].match_hits(params[:features])
+ end
+
+ # Lookup of precalculated descriptor values for a compound
+ # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type; optional: subjectid
+ # @return [Hash] Hash with feature URIs and numeric values
+ def self.lookup(params)
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ end
end
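All three entry points now take a single params hash; a hedged usage sketch (hit counts illustrative):

compound = OpenTox::Compound.from_smiles("c1ccccc1")
OpenTox::Algorithm::Substructure.match(
  :compound => compound, :features => ["cc", "cN"]
)  # => ["cc"]
OpenTox::Algorithm::Substructure.match_hits(
  :compound => compound, :features => ["cc", "cN"]
)  # => e.g. {"cc" => 12}  (number of non-unique hits)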
module Dataset
@@ -709,281 +537,5 @@ module OpenTox
def features(dataset_uri,compound_uri)
end
end
-
- module Transform
- include Algorithm
-
- # The transformer that inverts values.
- # 1/x is used, after values have been moved >= 1.
- class Inverter
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform.
- # @params[Float] Offset for restore.
- def initialize *args
- case args.size
- when 1
- begin
- values=args[0]
- raise "Cannot transform, values empty." if @values.size==0
- @values = values.collect { |v| -1.0 * v }
- @offset = 1.0 - @values.minmax[0]
- @offset = -1.0 * @offset if @offset>0.0
- @values.collect! { |v| v - @offset } # slide >1
- @values.collect! { |v| 1 / v } # invert to [0,1]
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- when 2
- @offset = args[1].to_f
- @values = args[0].collect { |v| 1 / v }
- @values.collect! { |v| v + @offset }
- @values.collect! { |v| -1.0 * v }
- end
- end
- end
-
- # The transformer that takes logs.
- # Log10 is used, after values have been moved > 0.
- class Log10
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform / restore.
- # @params[Float] Offset for restore.
- def initialize *args
- @distance_to_zero = 0.000000001 # 1 / 1 billion
- case args.size
- when 1
- begin
- values=args[0]
- raise "Cannot transform, values empty." if values.size==0
- @offset = values.minmax[0]
- @offset = -1.0 * @offset if @offset>0.0
- @values = values.collect { |v| v - @offset } # slide > anchor
- @values.collect! { |v| v + @distance_to_zero } #
- @values.collect! { |v| Math::log10 v } # log10 (can fail)
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- when 2
- @offset = args[1].to_f
- @values = args[0].collect { |v| 10**v }
- @values.collect! { |v| v - @distance_to_zero }
- @values.collect! { |v| v + @offset }
- end
- end
- end
-
- # The transformer that does nothing (No OPeration).
- class NOP
- attr_accessor :offset, :values
-
- # @params[Array] Values to transform / restore.
- # @params[Float] Offset for restore.
- def initialize *args
- @offset = 0.0
- @distance_to_zero = 0.0
- case args.size
- when 1
- @values = args[0]
- when 2
- @values = args[0]
- end
- end
- end
-
-
- # Auto-Scaler for Arrays
- # Center on mean and divide by standard deviation
- class AutoScale
- attr_accessor :scaled_values, :mean, :stdev
-
- # @params[Array] Values to transform.
- def initialize values
- @scaled_values = values
- @mean = @scaled_values.to_scale.mean
- @stdev = @scaled_values.to_scale.standard_deviation_sample
- @scaled_values = @scaled_values.collect {|vi| vi - @mean }
- @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
- end
- end
-
- # Principal Components Analysis
- # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
- class PCA
- attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
-
- # Creates a transformed dataset as GSL::Matrix.
- # @param [GSL::Matrix] Data matrix.
- # @param [Float] Compression ratio from [0,1].
- # @return [GSL::Matrix] Data transformed matrix.
- def initialize data_matrix, compression=0.05
- begin
- @data_matrix = data_matrix
- @compression = compression.to_f
- @stdev = Array.new
- @mean = Array.new
-
- # Objective Feature Selection
- raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
- @data_matrix_selected = nil
- (0..@data_matrix.size2-1).each { |i|
- if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
- if @data_matrix_selected.nil?
- @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
- @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
- else
- @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
- end
- end
- }
- raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
-
- # Scaling of Axes
- @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
- (0..@data_matrix_selected.size2-1).each { |i|
- @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
- @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
- @stdev << @autoscaler.stdev
- @mean << @autoscaler.mean
- }
-
- data_matrix_hash = Hash.new
- (0..@data_matrix_scaled.size2-1).each { |i|
- column_view = @data_matrix_scaled.col(i)
- data_matrix_hash[i] = column_view.to_scale
- }
- dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
- cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
- pca=Statsample::Factor::PCA.new(cor_matrix)
- pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
- @eigenvalue_sums = Array.new
- (0..dataset_hash.fields.size-1).each { |i|
- @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
- }
- eigenvectors_selected = Array.new
- pca.eigenvectors.each_with_index { |ev, i|
- if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
- eigenvectors_selected << ev.to_a
- end
- }
- @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
- dataset_matrix = dataset_hash.to_gsl.transpose
- @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
-
- # Restores data in the original feature space (possibly with compression loss).
- # @return [GSL::Matrix] Data matrix.
- def restore
- begin
- data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
- # reverse scaling
- (0..data_matrix_restored.size2-1).each { |i|
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
- }
- data_matrix_restored
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- end
- end
-
- end
-
- end
-
- # Gauss kernel
- # @return [Float]
- def self.gauss(x, sigma = 0.3)
- d = 1.0 - x.to_f
- Math.exp(-(d*d)/(2*sigma*sigma))
- end
-
- # For symbolic features
- # @param [Array] Array to test, must indicate non-occurrence with 0.
- # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
- def self.isnull_or_singular?(array)
- nr_zeroes = array.count(0)
- return (nr_zeroes == array.size) || # remove non-occurring feature
- (nr_zeroes == array.size-1) || # remove singular feature
- (nr_zeroes == 0) # also remove feature present everywhere
- end
-
- # Numeric value test
- # @param[Object] value
- # @return [Boolean] Whether value is a number
- def self.numeric?(value)
- true if Float(value) rescue false
- end
-
- # For symbolic features
- # @param [Array] Array to test, must indicate non-occurrence with 0.
- # @return [Boolean] Whether the feature has variance zero.
- def self.zero_variance?(array)
- return (array.to_scale.variance_population == 0.0)
- end
-
- # Sum of an array for Arrays.
- # @param [Array] Array with values
- # @return [Integer] Sum of size of values
- def self.sum_size(array)
- sum=0
- array.each { |e| sum += e.size }
- return sum
- end
-
- # Minimum Frequency
- # @param [Integer] per-mil value
- # return [Integer] min-frequency
- def self.min_frequency(training_dataset,per_mil)
- minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
- minfreq = 2 unless minfreq > 2
- Integer (minfreq)
- end
-
- # Effect calculation for classification
- # @param [Array] Array of occurrences per class in the form of Enumerables.
- # @param [Array] Array of database instance counts per class.
- def self.effect(occurrences, db_instances)
- max=0
- max_value=0
- nr_o = self.sum_size(occurrences)
- nr_db = db_instances.to_scale.sum
-
- occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
- actual = o.size.to_f/nr_o
- expected = db_instances[i].to_f/nr_db
- if actual > expected
- if ((actual - expected) / actual) > max_value
- max_value = (actual - expected) / actual # 'Schleppzeiger'
- max = i
- end
- end
- }
- max
- end
-
- # Returns Support value of an fingerprint
- # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
- # return [Numeric] Support value
- def self.p_sum_support(params)
- p_sum = 0.0
- params[:features].each{|f|
- compound_hits = params[:compound_features_hits][f]
- neighbor_hits = params[:training_compound_features_hits][f]
- p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
- }
- p_sum
- end
-
end
end
-
-
diff --git a/lib/authorization.rb b/lib/authorization.rb
index 5d57781..a9744e9 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,13 +37,15 @@ module OpenTox
#Loads and sends Policyfile(XML) to open-sso server
# @param [String] URI to create a policy for
- def send(uri)
+ def send(uri)
xml = get_xml(uri)
ret = false
- ret = Authorization.create_policy(xml, @subjectid)
+ ret = Authorization.create_policy(xml, @subjectid)
+ LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
+ ret = Authorization.create_policy(xml, @subjectid) if !ret
LOGGER.debug "Policy send with subjectid: #{@subjectid}"
LOGGER.warn "Not created Policy is: #{xml}" if !ret
- ret
+ ret
end
end
@@ -337,7 +339,7 @@ module OpenTox
# @param [String] subjectid
# @return [Boolean] true if access granted, else otherwise
def self.authorized?(uri, request_method, subjectid)
- if CONFIG[:authorization][:free_request].include?(request_method)
+ if CONFIG[:authorization][:free_request].include?(request_method)
#LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
true
elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -360,7 +362,7 @@ module OpenTox
false
end
end
-
+
private
def self.free_uri?(uri, request_method)
if CONFIG[:authorization][:free_uris]
@@ -374,7 +376,7 @@ module OpenTox
end
return false
end
-
+
def self.authorize_exception?(uri, request_method)
if CONFIG[:authorization][:authorize_exceptions]
CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -387,6 +389,6 @@ module OpenTox
end
return false
end
-
+
end
-end
\ No newline at end of file
+end
diff --git a/lib/compound.rb b/lib/compound.rb
index e7b4da0..c25125b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -6,13 +6,15 @@ module OpenTox
# Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
class Compound
+ include OpenTox
+
attr_accessor :inchi, :uri
# Create compound with optional uri
# @example
- # compound = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"")
+ # compound = Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H")
# @param [optional, String] uri Compound URI
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def initialize(uri=nil)
@uri = uri
case @uri
@@ -36,9 +38,9 @@ module OpenTox
# Create a compound from smiles string
# @example
- # compound = OpenTox::Compound.from_smiles("c1ccccc1")
+ # compound = Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_smiles(smiles)
c = Compound.new
c.inchi = Compound.smiles2inchi(smiles)
@@ -48,7 +50,7 @@ module OpenTox
# Create a compound from inchi string
# @param [String] smiles InChI string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_inchi(inchi)
c = Compound.new
c.inchi = inchi
@@ -58,7 +60,7 @@ module OpenTox
# Create a compound from sdf string
# @param [String] smiles SDF string
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_sdf(sdf)
c = Compound.new
c.inchi = Compound.sdf2inchi(sdf)
@@ -68,9 +70,9 @@ module OpenTox
# Create a compound from name. Relies on an external service for name lookups.
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# @param [String] name name can be also an InChI/InChiKey, CAS number, etc
- # @return [OpenTox::Compound] Compound
+ # @return [Compound] Compound
def self.from_name(name)
c = Compound.new
# paranoid URI encoding to keep SMILES charges and brackets
@@ -131,7 +133,7 @@ module OpenTox
# Match a smarts string
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match?("cN") # returns false
# @param [String] smarts Smarts string
def match?(smarts)
@@ -146,7 +148,7 @@ module OpenTox
# Match an array of smarts strings, returns array with matching smarts
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match(['cc','cN']) # returns ['cc']
# @param [Array] smarts_array Array with Smarts strings
# @return [Array] Array with matching Smarts strings
@@ -166,7 +168,7 @@ module OpenTox
# Match_hits an array of smarts strings, returns hash with matching smarts as key and number of non-unique hits as value
# @example
- # compound = OpenTox::Compound.from_name("Benzene")
+ # compound = Compound.from_name("Benzene")
# compound.match(['cc','cN']) # returns ['cc']
# @param [Array] smarts_array Array with Smarts strings
# @return [Hash] Hash with matching smarts as key and number of non-unique hits as value
@@ -191,6 +193,40 @@ module OpenTox
return smarts_hits
#smarts_array.collect { |s| s if match?(s)}.compact
end
+
+ # Lookup numerical values, returns hash with feature name as key and value as value
+ # @param [Array] Array of feature names
+ # @param [String] Feature dataset uri
+ # @return [Hash] Hash with feature name as key and value as value
+ def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
+ #entry = ds.data_entries[self.uri]
+ entry = nil
+ ds.data_entries.each { |c_uri, values|
+ if c_uri.split('/compound/').last == self.to_inchi
+ entry = ds.data_entries[self.uri]
+ break
+ end
+ }
+ LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
+
+ if entry.nil?
+ uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
+ uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
+ ds = OpenTox::Dataset.find(uri,subjectid)
+ entry = ds.data_entries[self.uri]
+ ds.delete(subjectid)
+ end
+ features = entry.keys
+ features.each { |feature|
+ new_feature = File.join(feature_dataset_uri, "feature", feature.split("/").last)
+ entry[new_feature] = entry[feature].flatten.first.to_f # see algorithm/lazar.rb:182, to_f because feature type detection doesn't work w 1 entry
+ entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
+ }
+ #res = feature_array.collect {|v| entry[v]}
+ #LOGGER.debug "----- am #{entry.to_yaml}"
+ entry
+ end
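A hedged usage sketch for the new lookup; the dataset URI, feature name, and returned value are placeholders:

compound = OpenTox::Compound.from_smiles("c1ccccc1")
compound.lookup(["XLogP"],                  # feature names
                "http://host/dataset/123",  # feature dataset URI
                "constitutional")           # pc_type
# => {"http://host/dataset/123/feature/XLogP" => 2.13}  (illustrative)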
# Get URI of compound image with highlighted fragments
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 0911073..95c1918 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -288,7 +288,7 @@ module OpenTox
# Insert a statement (compound_uri,feature_uri,value)
# @example Insert a statement (compound_uri,feature_uri,value)
- # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true
+ # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", 1
# @param [String] compound Compound URI
# @param [String] feature Compound URI
# @param [Boolean,Float] value Feature value
@@ -315,6 +315,16 @@ module OpenTox
@features[feature] = metadata
end
+ # Complete feature values by adding zeroes
+ def complete_data_entries
+ all_features = @features.keys
+ @data_entries.each { |c, e|
+ (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
+ self.add(c,f,0)
+ }
+ }
+ end
+
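In effect the method densifies the entry table; a hedged illustration:

# Before: {"c1" => {"f1" => [1]}, "c2" => {}}          (features: f1, f2)
# After:  {"c1" => {"f1" => [1], "f2" => [0]},
#          "c2" => {"f1" => [0], "f2" => [0]}}
dataset.complete_data_entries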
# Add/modify metadata for a feature
# @param [String] feature Feature URI
# @param [Hash] metadata Hash with feature metadata
@@ -363,7 +373,45 @@ module OpenTox
dataset.save(subjectid)
dataset
end
-
+
+ # merges two datasets into a new dataset (by default all compounds and features are used)
+ # precondition: both datasets are fully loaded
+ # @param [OpenTox::Dataset] dataset1 to merge
+ # @param [OpenTox::Dataset] dataset2 to merge
+ # @param [Hash] metadata
+ # @param [optional,String] subjectid
+ # @param [optional,Array] features1, if specified only these features of dataset1 are used
+ # @param [optional,Array] features2, if specified only these features of dataset2 are used
+ # @param [optional,Array] compounds1, if specified only these compounds of dataset1 are used
+ # @param [optional,Array] compounds2, if specified only these compounds of dataset2 are used
+ # example: if you want no features from dataset2, pass an empty array as features2
+ def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil )
+ features1 = dataset1.features.keys unless features1
+ features2 = dataset2.features.keys unless features2
+ compounds1 = dataset1.compounds unless compounds1
+ compounds2 = dataset2.compounds unless compounds2
+ data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}")
+ [[dataset1, features1, compounds1], [dataset2, features2, compounds2]].each do |dataset,features,compounds|
+ compounds.each{|c| data_combined.add_compound(c)}
+ features.each do |f|
+ m = dataset.features[f]
+ m[OT.hasSource] = dataset.uri unless m[OT.hasSource]
+ data_combined.add_feature(f,m)
+ compounds.each do |c|
+ dataset.data_entries[c][f].each do |v|
+ data_combined.add(c,f,v)
+ end if dataset.data_entries[c] and dataset.data_entries[c][f]
+ end
+ end
+ end
+ metadata = {} unless metadata
+ metadata[OT.hasSource] = "Merge from #{dataset1.uri} and #{dataset2.uri}" unless metadata[OT.hasSource]
+ data_combined.add_metadata(metadata)
+ data_combined.save(subjectid)
+ data_combined
+ end
+
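A hedged usage sketch for merge; URIs are placeholders, and the empty array shows the feature-exclusion case mentioned in the comment above:

d1 = OpenTox::Dataset.find("http://host/dataset/1")
d2 = OpenTox::Dataset.find("http://host/dataset/2")
merged = OpenTox::Dataset.merge(d1, d2, {DC.title => "combined"}, nil,
                                nil, [])  # all of d1, only compounds of d2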
# Save dataset at the dataset service
# - creates a new dataset if uri is not set
# - overwrites dataset if uri exists
diff --git a/lib/environment.rb b/lib/environment.rb
index 3775797..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -24,7 +24,11 @@ end
# database
#`redis-server /opt/redis/redis.conf` unless File.exists? "/var/run/redis.pid" # removed by AM
-Ohm.connect :thread_safe => true
+ohm_port=6379
+if !CONFIG[:ohm_port].nil?
+ ohm_port=CONFIG[:ohm_port].to_i
+end
+Ohm.connect(:thread_safe => true, :port => ohm_port)
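With this change the Redis port becomes configurable; a hedged sketch, assuming the usual Ruby CONFIG hash of opentox-ruby:

CONFIG[:ohm_port] = 6380  # Ohm/Redis connects here; falls back to 6379 when unset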
# load mail settings for error messages
#load File.join config_dir,"mail.rb" if File.exists?(File.join config_dir,"mail.rb")
@@ -87,4 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
diff --git a/lib/model.rb b/lib/model.rb
index 0b116c2..a858a0f 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -102,8 +102,8 @@ module OpenTox
include Algorithm
include Model
- attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim, :subjectid, :prop_kernel, :value_map, :nr_hits, :transform, :conf_stdev, :prediction_min_max
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
def initialize(uri=nil)
if uri
@@ -120,18 +120,11 @@ module OpenTox
@p_values = {}
@fingerprints = {}
@value_map = {}
- @prediction_min_max = []
@feature_calculation_algorithm = "Substructure.match"
@similarity_algorithm = "Similarity.tanimoto"
@prediction_algorithm = "Neighbors.weighted_majority_vote"
- @nr_hits = false
- @min_sim = 0.3
- @prop_kernel = false
- @transform = { "class" => "NOP" }
- @conf_stdev = false
-
end
# Get URIs of all lazar models
@@ -174,19 +167,14 @@ module OpenTox
lazar.feature_calculation_algorithm = hash["feature_calculation_algorithm"] if hash["feature_calculation_algorithm"]
lazar.similarity_algorithm = hash["similarity_algorithm"] if hash["similarity_algorithm"]
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
- lazar.min_sim = hash["min_sim"] if hash["min_sim"]
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
- lazar.prop_kernel = hash["prop_kernel"] if hash["prop_kernel"]
lazar.value_map = hash["value_map"] if hash["value_map"]
- lazar.nr_hits = hash["nr_hits"] if hash["nr_hits"]
- lazar.transform = hash["transform"] if hash["transform"]
- lazar.conf_stdev = hash["conf_stdev"] if hash["conf_stdev"]
- lazar.prediction_min_max = hash["prediction_min_max"] if hash["prediction_min_max"]
+
lazar
end
def to_json
- Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :min_sim => @min_sim, :subjectid => @subjectid, :prop_kernel => @prop_kernel, :value_map => @value_map, :nr_hits => @nr_hits, :transform => @transform, :conf_stdev => @conf_stdev, :prediction_min_max => @prediction_min_max})
+ Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
end
def run( params, accept_header=nil, waiting_task=nil )
@@ -230,8 +218,11 @@ module OpenTox
predict(compound_uri,false,subjectid)
count += 1
waiting_task.progress( count/d.compounds.size.to_f*100.0 ) if waiting_task
- rescue => ex
- LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+ex.message+" subjectid: #{subjectid}"
+ rescue => e
+ LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+e.message+" subjectid: #{subjectid}"
+ #LOGGER.debug "#{e.class}: #{e.message}"
+ #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+
end
end
#@prediction_dataset.save(subjectid)
@@ -246,7 +237,6 @@ module OpenTox
@compound = Compound.new compound_uri
features = {}
-
#LOGGER.debug self.to_yaml
unless @prediction_dataset
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -257,29 +247,42 @@ module OpenTox
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
} )
end
-
if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
all_activities = []
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
- @prediction_min_max[0] = (all_activities.to_scale.min/2)
- @prediction_min_max[1] = (all_activities.to_scale.max*2)
end
-
unless database_activity(subjectid) # adds database activity to @prediction_dataset
+ # Calculation of needed values for query compound
+ @compound_features = eval("#{@feature_calculation_algorithm}({
+ :compound => @compound,
+ :features => @features,
+ :feature_dataset_uri => @metadata[OT.featureDataset],
+ :pc_type => self.parameter(\"pc_type\"),
+ :subjectid => subjectid
+ })")
+ # Add the fingerprint of the query compound: features mapped to weighted values (e.g. p_value * nr_hits)
+ @compound_fingerprints = {}
+ @compound_features.each do |feature, value| # value is nil if "Substructure.match"
+ if @feature_calculation_algorithm == "Substructure.match_hits"
+ @compound_fingerprints[feature] = @p_values[feature] * value
+ elsif @feature_calculation_algorithm == "Substructure.match"
+ @compound_fingerprints[feature] = @p_values[feature]
+ elsif @feature_calculation_algorithm == "Substructure.lookup"
+ @compound_fingerprints[feature] = value
+ end
+ end
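+ # Worked example (hypothetical feature and numbers): for a substructure
+ # feature with p_value 0.8 that matches the query compound 3 times,
+ # "Substructure.match_hits" stores 0.8 * 3 = 2.4, "Substructure.match"
+ # stores 0.8, and "Substructure.lookup" stores the raw descriptor value.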
- neighbors
- prediction = eval("#{@prediction_algorithm} ( { :neighbors => @neighbors,
- :compound => @compound,
- :features => @features,
- :p_values => @p_values,
- :fingerprints => @fingerprints,
- :similarity_algorithm => @similarity_algorithm,
- :prop_kernel => @prop_kernel,
+ # Transform model data to machine learning scheme (tables of data)
+ mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
+ mtf.transform
+
+ # Make a prediction
+ prediction = eval("#{@prediction_algorithm}( { :props => mtf.props,
+ :acts => mtf.acts,
+ :sims => mtf.sims,
:value_map => @value_map,
- :nr_hits => @nr_hits,
- :conf_stdev => @conf_stdev,
- :prediction_min_max => @prediction_min_max,
- :transform => @transform } ) ")
+ :min_train_performance => self.parameter(\"min_train_performance\")
+ } ) ")
value_feature_uri = File.join( @uri, "predicted", "value")
confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@@ -355,44 +358,6 @@ module OpenTox
@prediction_dataset
end
-
-
- # Find neighbors and store them as object variable, access all compounds for that.
- def neighbors
- @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
- @neighbors = []
- @fingerprints.keys.each do |training_compound| # AM: access all compounds
- add_neighbor @fingerprints[training_compound].keys, training_compound
- end
- end
-
- # Adds a neighbor to @neighbors if it passes the similarity threshold.
- def add_neighbor(training_features, training_compound)
- compound_features_hits = {}
- training_compound_features_hits = {}
- if @nr_hits
- compound_features_hits = @compound.match_hits(@compound_features)
- training_compound_features_hits = @fingerprints[training_compound]
- #LOGGER.debug "dv ------------ training_compound_features_hits:#{training_compound_features_hits.class} #{training_compound_features_hits}"
- end
- params = {}
- params[:nr_hits] = @nr_hits
- params[:compound_features_hits] = compound_features_hits
- params[:training_compound_features_hits] = training_compound_features_hits
-
- sim = eval("#{@similarity_algorithm}(training_features, @compound_features, @p_values, params)")
- if sim > @min_sim
- @activities[training_compound].each do |act|
- @neighbors << {
- :compound => training_compound,
- :similarity => sim,
- :features => training_features,
- :activity => act
- }
- end
- end
- end
-
# Find database activities and store them in @prediction_dataset
# @return [Boolean] true if compound has database activities, false if not
def database_activity(subjectid)
diff --git a/lib/opentox-ruby.rb b/lib/opentox-ruby.rb
index 1fa2a86..d25632c 100644
--- a/lib/opentox-ruby.rb
+++ b/lib/opentox-ruby.rb
@@ -9,6 +9,6 @@ rescue LoadError
end
['opentox', 'compound','dataset', 'parser','serializer', 'algorithm','model','task','validation','feature',
- 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology' ].each do |lib|
+ 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology', 'r-util' ].each do |lib|
require lib
end
diff --git a/lib/parser.rb b/lib/parser.rb
index d0975af..56e4fed 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -57,7 +57,7 @@ module OpenTox
`rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
triple = line.to_triple
if triple[0] == @uri
- if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
+ if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
@metadata[triple[1]] = [] unless @metadata[triple[1]]
@metadata[triple[1]] << triple[2].split('^^').first
else
@@ -290,10 +290,11 @@ module OpenTox
@features = []
@feature_types = {}
- @format_errors = ""
- @smiles_errors = []
+ @format_errors = []
+ @id_errors = []
@activity_errors = []
@duplicates = {}
+ @max_class_values = 3
end
def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
# Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
# @param [Excel] book Excel workbook object (created with roo gem)
# @return [OpenTox::Dataset] Dataset object with Excel data
- def load_spreadsheet(book)
+ def load_spreadsheet(book, drop_missing=false)
book.default_sheet = 0
- add_features book.row(1)
+ headers = book.row(1)
+ add_features headers
value_maps = Array.new
regression_features=Array.new
@@ -319,15 +321,27 @@ module OpenTox
row = book.row(i)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+ if vm.size > @max_class_values # @max_class_values is the maximum nr of classes supported by Fminer.
regression_features[j]=true
else
regression_features[j]=false
end
}
}
+
2.upto(book.last_row) { |i|
- add_values book.row(i), regression_features
+ drop=false
+ row = book.row(i)
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+ if row.include?("")
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
+ drop=true
+ drop_missing=true if (row.count("") == row.size-1)
+ end
+ add_values(row, regression_features) unless (drop_missing && drop)
+ if (drop_missing && drop)
+ @format_errors << "Row #{i} not added"
+ end
}
warnings
@dataset
@@ -336,10 +350,11 @@ module OpenTox
# Load CSV string (format specification: http://toxcreate.org/help)
# @param [String] csv CSV representation of the dataset
# @return [OpenTox::Dataset] Dataset object with CSV data
- def load_csv(csv)
+ def load_csv(csv, drop_missing=false)
row = 0
input = csv.split("\n")
- add_features split_row(input.shift)
+ headers = split_row(input.shift)
+ add_features(headers)
value_maps = Array.new
regression_features=Array.new
@@ -347,15 +362,27 @@ module OpenTox
row = split_row(row)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+ if vm.size > @max_class_values # max @max_class_values classes.
regression_features[j]=true
else
regression_features[j]=false
end
}
}
- input.each { |row|
- add_values split_row(row), regression_features
+
+ input.each_with_index { |row, i|
+ drop=false
+ row = split_row(row)
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+ if row.include?("")
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
+ drop=true
+ drop_missing=true if (row.count("") == row.size-1)
+ end
+ add_values(row, regression_features) unless (drop_missing && drop)
+ if (drop_missing && drop)
+ @format_errors << "Row #{i} not added"
+ end
}
warnings
@dataset
@@ -367,88 +394,115 @@ module OpenTox
info = ''
@feature_types.each do |feature,types|
- if types.uniq.size > 1
+ if types.uniq.size == 0
+ type = "helper#MissingFeature"
+ elsif types.uniq.size > 1
type = OT.NumericFeature
else
type = types.first
end
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
# TODO: rewrite feature values
- # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
+ # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
end
@dataset.metadata[OT.Info] = info
warnings = ''
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
+ warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+ warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
duplicate_warnings = ''
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
- warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+ warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
@dataset.metadata[OT.Warnings] = warnings
end
+ # Adds a row of features to a dataset
+ # @param Array A row split up as an array
+ # Indices of duplicate features are recorded in @duplicate_feature_indices
def add_features(row)
- row.shift # get rid of smiles entry
- row.each do |feature_name|
+ row=row.collect
+ row.shift # get rid of id entry
+ @duplicate_feature_indices = [] # indices start at 0 for the first feature after the id column
+ row.each_with_index do |feature_name, idx|
feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
- @feature_types[feature_uri] = []
- @features << feature_uri
- @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ unless @features.include? feature_uri
+ @feature_types[feature_uri] = []
+ @features << feature_uri
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ else
+ @duplicate_feature_indices << idx
+ @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
+ end
end
end
# Adds a row to a dataset
# @param Array A row split up as an array
# @param Array Indicator for regression for each field
+ # (indices of duplicate features are read from @duplicate_feature_indices)
def add_values(row, regression_features)
- smiles = row.shift
- compound = Compound.from_smiles(smiles)
+ id = row.shift
+ case id
+ when /InChI/
+ compound = Compound.from_inchi(URI.decode_www_form_component(id))
+ else
+ compound = Compound.from_smiles(id)
+ end
+
if compound.nil? or compound.inchi.nil? or compound.inchi == ""
- @smiles_errors << smiles+", "+row.join(", ")
+ @id_errors << id+", "+row.join(", ")
return false
end
@duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
- @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+ @duplicates[compound.inchi] << id+", "+row.join(", ")
+ feature_idx = 0
row.each_index do |i|
- value = row[i]
- feature = @features[i]
- type = nil
- if (regression_features[i])
- type = feature_type(value)
- if type != OT.NumericFeature
- raise "Error! Expected numeric values."
+ unless @duplicate_feature_indices.include? i
+
+ value = row[i]
+ #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
+ feature = @features[feature_idx]
+
+ type = feature_type(value) # May be NIL
+ type = OT.NominalFeature unless (type.nil? || regression_features[i])
+ @feature_types[feature] << type if type
+
+ val = nil
+ case type
+ when OT.NumericFeature
+ val = value.to_f
+ when OT.NominalFeature
+ val = value.to_s
end
- else
- type = OT.NominalFeature
- end
- @feature_types[feature] << type
- case type
- when OT.NumericFeature
- val = value.to_f
- when OT.NominalFeature
- val = value.to_s
- end
- if val!=nil
- @dataset.add(compound.uri, feature, val)
- if type!=OT.NumericFeature
- @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
- @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+ feature_idx += 1
+
+ if val != nil
+ @dataset.add(compound.uri, feature, val)
+ if type != OT.NumericFeature
+ @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+ @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+ end
end
+
end
+
end
end
def feature_type(value)
- if OpenTox::Algorithm::numeric? value
+ if value == ""
+ return nil
+ elsif OpenTox::Algorithm::numeric? value
return OT.NumericFeature
else
return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
end
def split_row(row)
- row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
+ row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
end
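+ # Effect of the -1 limit (plain Ruby behaviour, hypothetical input):
+ # "a,b,,".split(/\s*[,;\t]\s*/, -1) #=> ["a", "b", "", ""]
+ # Without -1 the trailing empty cells would be dropped and the row size
+ # check against the headers would fail for rows with trailing blanks.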
end
@@ -468,6 +522,7 @@ module OpenTox
def initialize
@data = {}
@activity_errors = []
+ @max_class_values = 3
end
def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
def clean_features
ignored_features = []
features.each do |feature|
- if feature_values(feature).size > 5
+ if feature_values(feature).size > @max_class_values
if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
# REGRESSION
elsif feature_types(feature).include? OT.NumericFeature
@data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
@activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
else
- @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+ @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
ignored_features << feature
next
end
@@ -543,12 +598,15 @@ module OpenTox
private
def feature_type(value)
- if OpenTox::Algorithm::numeric? value
+ if value.nil?
+ return nil
+ elsif OpenTox::Algorithm::numeric? value
return OT.NumericFeature
else
return OT.NominalFeature
end
end
+
end
# quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
@duplicates[inchi] << rec #inchi#+", "+row.join(", ")
compound = Compound.from_inchi inchi
rescue
- @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
next
end
row = {}
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
table.data[compound.uri] = row
end
-
- # finda and remove ignored_features
+
+ # find and remove ignored_features
@activity_errors = table.clean_features
table.add_to_dataset @dataset
warnings = ''
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+ warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
duplicate_warnings = ''
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
diff --git a/lib/r-util.rb b/lib/r-util.rb
new file mode 100644
index 0000000..7163c46
--- /dev/null
+++ b/lib/r-util.rb
@@ -0,0 +1,354 @@
+# pending: package dir hack ---------
+# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
+# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
+package_dir = CONFIG[:base_dir].split("/")
+package_dir[-1] = "r-packages"
+package_dir = package_dir.join("/")
+PACKAGE_DIR = package_dir
+
+require "tempfile"
+
+module OpenTox
+
+ class RUtil
+
+ @@feats = {}
+
+ def initialize
+ @r = RinRuby.new(true,false) unless defined?(@r) and @r
+ @r.eval ".libPaths('#{PACKAGE_DIR}')"
+ @r_packages = @r.pull "installed.packages()[,1]"
+ ["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
+ @r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
+ end
+
+ def quit_r
+ begin
+ @r.quit
+ @r = nil
+ rescue
+ end
+ end
+
+ def r
+ @r
+ end
+
+ def package_installed?( package )
+ @r_packages.include?(package)
+ end
+
+ def install_package( package )
+ unless package_installed?(package)
+ LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
+ @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
+ end
+ end
+
+ # <0 -> array1 << array2
+ # 0 -> no significant difference
+ # >0 -> array1 >> array2
+ def paired_ttest(array1, array2, significance_level=0.95)
+ @r.assign "v1",array1
+ @r.assign "v2",array2
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+ t = @r.pull "ttest$statistic"
+ p = @r.pull "ttest$p.value"
+ if (1-significance_level > p)
+ t
+ else
+ 0
+ end
+ end
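+ # Minimal usage sketch (hypothetical numbers):
+ #   util = OpenTox::RUtil.new
+ #   util.paired_ttest([0.7, 0.8, 0.6], [0.5, 0.6, 0.7])
+ # returns the t statistic when p < 0.05 (significance_level 0.95), else 0.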
+
+ # example:
+ # files = ["/tmp/box.svg","/tmp/box.png"]
+ # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
+ # boxplot(files, data, "comparison1" )
+ #
+ def boxplot(files, data, title="")
+ LOGGER.debug("r-util> create boxplot")
+ assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
+ plot_to_files(files) do |file|
+ @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
+ end
+ end
+
+ # embeds feature values of two datasets into 2D and plots them
+ # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
+ #
+ def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
+ features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+
+ raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
+ LOGGER.debug("r-util> create feature value plot")
+ d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
+ d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
+ if features
+ [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
+ else
+ raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
+ (d1.features.keys.sort != d2.features.keys.sort)
+ features = d1.features.keys
+ end
+ raise "at least two features needed" if d1.features.keys.size<2
+ waiting_task.progress(25) if waiting_task
+
+ df1 = dataset_to_dataframe(d1,0,subjectid,features)
+ df2 = dataset_to_dataframe(d2,0,subjectid,features)
+ waiting_task.progress(50) if waiting_task
+
+ @r.eval "df <- rbind(#{df1},#{df2})"
+ @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+ @r.names = [dataset_name1, dataset_name2]
+ LOGGER.debug("r-util> - convert data to 2d")
+ @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+ waiting_task.progress(75) if waiting_task
+
+ if fast_plot
+ info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+ else
+ info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+ end
+ LOGGER.debug("r-util> - plot data")
+ plot_to_files(files) do |file|
+ @r.eval "plot_split( df.2d, split, names, #{info})"
+ end
+ end
+
+ # plots a double histogram
+ # data1 and data2 are arrays with values, either numerical or categorical (string values)
+ # is_numerical, boolean flag indicating value types
+ # log (only for numerical), plot logarithm of values
+ def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
+ LOGGER.debug("r-util> create double hist plot")
+ all = data1 + data2
+ if (is_numerical)
+ @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
+ {
+ if (log)
+ {
+ data1 <- log(data1)
+ data2 <- log(data2)
+ xlab = paste('logarithm of',xlab,sep=' ')
+ }
+ xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
+ h <- hist(rbind(data1,data2),plot=F)
+ h1 <- hist(data1,plot=F,breaks=h$breaks)
+ h2 <- hist(data2,plot=F,breaks=h$breaks)
+ xlims = c(min(h$breaks),max(h$breaks))
+ ylims = c(0,max(h1$counts,h2$counts))
+ xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
+ plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
+ main=title, xlab=xlab, ylab='counts' )
+ plot(h2, col=rgb(0,1,0,2/4), add=T )
+ legend('topleft',names,lty=c(1,1),col=c('red','green'))
+ }"
+ @r.assign("data1",data1)
+ @r.assign("data2",data2)
+ @r.legend = [name1, name2]
+ else
+ raise "log not valid for categorial" if log
+ vals = all.uniq.sort!
+ counts1 = vals.collect{|e| data1.count(e)}
+ counts2 = vals.collect{|e| data2.count(e)}
+ @r.data1 = counts1
+ @r.data2 = counts2
+ @r.value_names = [name1, name2]
+ @r.legend = vals
+ @r.eval("data <- cbind(data1,data2)")
+ end
+
+ plot_to_files(files) do |file|
+ if (is_numerical)
+ @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
+ else
+ @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
+ main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
+ @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
+ end
+ end
+ end
+
+ # stratified split of a dataset into two datasets, based on the feature values
+ # all features are taken into account unless <split_features> is given
+ def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+ raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+ LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
+
+ df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+ @r.eval "set.seed(#{seed})"
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+ split = @r.pull 'split'
+ split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+ split_to_datasets( df, split, subjectid )
+ end
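+ # Minimal usage sketch, assuming a fully loaded dataset:
+ #   ds = OpenTox::Dataset.find(dataset_uri, subjectid)
+ #   sampled, rest = OpenTox::RUtil.new.stratified_split(ds, "NA", 0.3, subjectid)
+ # After the 0/1 inversion the sampled fraction (pct, here ~30%) comes first.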
+
+ # dataset should be loaded completely (use Dataset.find)
+ # takes duplicates into account
+ # replaces missing values with param <missing_value>
+ # returns dataframe-variable-name in R
+ def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
+ LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
+
+ # count duplicates
+ num_compounds = {}
+ dataset.features.keys.each do |f|
+ dataset.compounds.each do |c|
+ if dataset.data_entries[c]
+ val = dataset.data_entries[c][f]
+ size = val==nil ? 1 : val.size
+ num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
+ else
+ num_compounds[c] = 1
+ end
+ end
+ end
+
+ # use either all, or the provided features, sorting is important as col-index := features
+ if features
+ features.sort!
+ else
+ features = dataset.features.keys.sort
+ end
+ compounds = []
+ dataset.compounds.each do |c|
+ num_compounds[c].times do |i|
+ compounds << c
+ end
+ end
+
+ # values into 2D array, then to dataframe
+ d_values = []
+ dataset.compounds.each do |c|
+ num_compounds[c].times do |i|
+ c_values = []
+ features.each do |f|
+ if dataset.data_entries[c]
+ val = dataset.data_entries[c][f]
+ v = val==nil ? "" : val[i].to_s
+ else
+ raise "wtf" if i>0
+ v = ""
+ end
+ v = missing_value if v.size()==0
+ c_values << v
+ end
+ d_values << c_values
+ end
+ end
+ df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
+ assign_dataframe(df_name,d_values,compounds,features)
+
+ # set dataframe column types accordingly
+ f_count = 1 #R starts at 1
+ features.each do |f|
+ feat = OpenTox::Feature.find(f,subjectid)
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+ if nominal
+ @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
+ else
+ @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
+ end
+ f_count += 1
+ end
+ #@r.eval "head(#{df_name})"
+
+ # store compounds, and features (including metainformation)
+ @@feats[df_name] = {}
+ features.each do |f|
+ @@feats[df_name][f] = dataset.features[f]
+ end
+ df_name
+ end
+
+ # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
+ # this is only possible if the dataframe (or a superset of it) was created by dataset_to_dataframe (metadata and URIs are required)
+ def dataframe_to_dataset( df, subjectid=nil )
+ dataframe_to_dataset_indices( df, subjectid, nil)
+ end
+
+ private
+ def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+ raise unless @@feats[df].size>0
+ values, compounds, features = pull_dataframe(df)
+ features.each{|f| raise unless @@feats[df][f]}
+ dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
+ compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
+ features.each{|f| dataset.add_feature(f,@@feats[df][f])}
+ features.size.times do |c|
+ feat = OpenTox::Feature.find(features[c],subjectid)
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+ compounds.size.times do |r|
+ if compound_indices==nil or compound_indices.include?(r)
+ dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+ end
+ end
+ end
+ dataset.save(subjectid)
+ dataset
+ end
+
+ def split_to_datasets( df, split, subjectid=nil )
+ sets = []
+ (split.min.to_i .. split.max.to_i).each do |i|
+ indices = []
+ split.size.times{|j| indices<<j if split[j]==i}
+ dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+ sets << dataset
+ end
+ sets
+ end
+
+ def pull_dataframe(df)
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+ @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
+ res = []; compounds = []; features = []
+ first = true
+ file = File.new(tmp, 'r')
+ file.each_line("\n") do |row|
+ if first
+ features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+ first = false
+ else
+ vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+ compounds << vals[0]
+ res << vals[1..-1]
+ end
+ end
+ begin File.delete(tmp); rescue; end
+ return res, compounds, features
+ end
+
+ def assign_dataframe(df,input,rownames,colnames)
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+ file = File.new(tmp, 'w')
+ input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
+ file.flush
+ @r.rownames = rownames if rownames
+ @r.colnames = colnames
+ @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
+ "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
+ begin File.delete(tmp); rescue; end
+ end
+
+ def plot_to_files(files)
+ files.each do |file|
+ if file=~/(?i)\.svg/
+ @r.eval("svg('#{file}',10,8)")
+ elsif file=~/(?i)\.png/
+ @r.eval("png('#{file}')")
+ else
+ raise "invalid format: "+file.to_s
+ end
+ yield file
+ LOGGER.debug "r-util> plotted to #{file}"
+ @r.eval("dev.off()")
+ end
+ end
+ end
+end
+
+
diff --git a/lib/rest_client_wrapper.rb b/lib/rest_client_wrapper.rb
index 6d25bb3..fcadebb 100644
--- a/lib/rest_client_wrapper.rb
+++ b/lib/rest_client_wrapper.rb
@@ -70,7 +70,7 @@ module OpenTox
begin
#LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
- resource = RestClient::Resource.new(uri,{:timeout => 60})
+ resource = RestClient::Resource.new(uri,{:timeout => 600})
if rest_call=="post" || rest_call=="put"
result = resource.send(rest_call, payload, headers)
else
diff --git a/lib/serializer.rb b/lib/serializer.rb
index b62ac45..30cb2ba 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -55,7 +55,7 @@ module OpenTox
OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
- #object props for validation#
+ #object props for validation#
OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
- # annotation props for validation
+ # annotation props for validation
OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
@data_entries = {}
@values_id = 0
@parameter_id = 0
-
- @classes = Set.new
+
+ @classes = Set.new
@object_properties = Set.new
@annotation_properties = Set.new
@datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
@object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
add_metadata uri, metadata
end
-
+
# Add a resource defined by resource_class and content
# (see documentation of add_content for example)
# @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
def add_uri(uri,type)
@object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
end
-
+
private
@@content_id = 1
-
+
#Recursive function to add content
#@example
# { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
hash.each do |u,v|
if v.is_a? Hash
# value is again a hash, i.e. a new owl class is added
- # first make sure type (==class) is set
+ # first make sure type (==class) is set
type = v[RDF.type]
raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
# add content to new class
add_content(genid,v)
elsif v.is_a? Array
- # value is an array, i.e. a list of values with property is added
+ # value is an array, i.e. a list of values with property is added
v.each{ |vv| add_content( uri, { u => vv } ) }
else # v.is_a? String
# simple string value
@@ -268,7 +268,7 @@ module OpenTox
end
end
end
-
+
public
# Add metadata
@@ -329,7 +329,7 @@ module OpenTox
v = [{ "type" => "uri", "value" => value}]
when "literal"
v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
- else
+ else
raise "Illegal type #{type(value)} for #{value}."
end
@object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
end
# Serializers
-
+
# Convert to N-Triples
# @return [text/plain] Object OWL-DL in N-Triples format
def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
entry.each do |p,objects|
p = url(p)
objects.each do |o|
- case o["type"]
+ case o["type"]
when "uri"
o = url(o["value"])
when "literal"
@@ -371,9 +371,15 @@ module OpenTox
# Convert to RDF/XML
# @return [text/plain] Object OWL-DL in RDF/XML format
def to_rdfxml
- Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+ tmpf = Tempfile.open("owl-serializer")
+ tmpf.write(self.to_ntriples)
+ tmpf.flush
+ @path = tmpf.path
# TODO: add base uri for ist services
- `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+ tmpf.close
+ tmpf.delete
+ res
end
# Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
@@ -427,20 +433,20 @@ module OpenTox
end
def literal(value,type)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatenation operators than +
'"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
end
def url(uri)
- # concat and << are faster string concatination operators than +
+ # concat and << are faster string concatenation operators than +
'<'.concat(uri).concat('>')
end
def rdf_types
- @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
- @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
- @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
- @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+ @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+ @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+ @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+ @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
end
end
@@ -457,35 +463,46 @@ module OpenTox
@rows.first << features
@rows.first.flatten!
dataset.data_entries.each do |compound,entries|
- smiles = Compound.new(compound).to_smiles
+ cmpd = Compound.new(compound)
+ smiles = cmpd.to_smiles
+ inchi = URI.encode_www_form_component(cmpd.to_inchi)
+ row_container = Array.new
row = Array.new(@rows.first.size)
- row[0] = smiles
+ row_container << row
+ #row[0] = smiles
+ row[0] = inchi
entries.each do |feature, values|
i = features.index(feature)+1
values.each do |value|
- if row[i]
- row[i] = "#{row[i]} #{value}" # multiple values
+ if row_container[0][i]
+ #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+ row_container << row_container.last.collect
+ row_container.last[i] = value
+ #LOGGER.debug "RC: #{row_container.to_yaml}"
else
- row[i] = value
+ row_container.each { |r| r[i] = value }
end
end
end
- @rows << row
+ row_container.each { |r| @rows << r }
end
end
# Convert to CSV string
# @return [String] CSV string
def to_csv
- @rows.collect{|r| r.join(", ")}.join("\n")
+ rows = @rows.collect
+ result = ""
+ result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+ result << rows.collect{ |r| r.join(",") }.join("\n")
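+ # Example (hypothetical feature URI): a header cell
+ # "http://host/dataset/1/feature/LogP" is shortened by f.split('/').last
+ # to "LogP"; the data rows are joined unchanged.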
end
# Convert to spreadsheet workbook
# @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gem to write a file)
- def to_spreadsheet
+ def to_spreadsheet(sheetname="sheet1")
Spreadsheet.client_encoding = 'UTF-8'
book = Spreadsheet::Workbook.new
- sheet = book.create_worksheet(:name => '')
+ sheet = book.create_worksheet(:name => "#{sheetname}")
sheet.column(0).width = 100
i = 0
@rows.each do |row|
diff --git a/lib/stratification.R b/lib/stratification.R
new file mode 100644
index 0000000..76ff2d8
--- /dev/null
+++ b/lib/stratification.R
@@ -0,0 +1,201 @@
+
+nominal_to_binary <- function( data )
+{
+ result = NULL
+ for (i in 1:ncol(data))
+ {
+ #print(i)
+ if (is.numeric( data[,i] ) )
+ {
+ if (is.null(result))
+ result = data.frame(data[,i])
+ else
+ result = data.frame(result, data[,i])
+ colnames(result)[ncol(result)] <- colnames(data)[i]
+ }
+ else
+ {
+ vals = unique(data[,i])
+ for (j in 1:length(vals))
+ {
+ #print(j)
+ bins = c()
+ for (k in 1:nrow(data))
+ {
+ if(data[,i][k] == vals[j])
+ bins = c(bins,1)
+ else
+ bins = c(bins,0)
+ }
+ #print(bins)
+ if (is.null(result))
+ result = data.frame(bins)
+ else
+ result = data.frame(result, bins)
+ colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+ if (length(vals)==2) break
+ }
+ }
+ }
+ #print(head(result))
+ result
+}
+
+process_data <- function( data )
+{
+ data.num <- as.data.frame(data)
+ if (!is.numeric(data.num))
+ {
+ data.num = nominal_to_binary(data.num)
+ }
+ if(any(is.na(data.num)))
+ {
+ require("gam")
+ data.repl = na.gam.replace(data.num)
+ }
+ else
+ data.repl = data.num
+ data.repl
+}
+
+cluster <- function( data, min=10, max=15 )
+{
+ require("vegan")
+ max <- min(max,nrow(unique(data)))
+ max <- min(max,nrow(data)-1)
+ if (min>max)
+ min=max
+ print(paste("cascade k-means ",min," - ",max))
+ s = cascadeKM(data,min,max,iter=30)
+ m = max.col(s$results)[2]
+ print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+ cbind(s$partition[,m])
+}
+
+stratified_split <- function( data, ratio=0.3, method="cluster" )
+{
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ require("sampling")
+ # adjust ratio to make samplecube return exact number of samples
+ ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ pik = rep(ratio,times=nrow(data.processed))
+ data.strat = cbind(pik,data.processed)
+ samplecube(data.strat,pik,order=2,comment=F)
+ }
+ else if (method == "cluster")
+ {
+ cl = cluster(data.processed)
+# require("caret")
+# res = createDataPartition(cl,p=ratio)
+# split = rep(1, times=nrow(data))
+# for (j in 1:nrow(data))
+# if ( is.na(match(j,res$Resample1)) )
+# split[j]=0
+# split
+ require("sampling")
+ stratified_split(cl,ratio,"samplecube")
+ }
+ else
+ stop("unknown method")
+}
+
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+{
+ print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+ data.processed = as.matrix(process_data( data ))
+ if (method == "samplecube")
+ {
+ folds = rep(0, times=nrow(data))
+ for (i in 1:(num_folds-1))
+ {
+ require("sampling")
+ prop = 1/(num_folds-(i-1))
+ print(paste("fold",i,"/",num_folds," prop",prop))
+ pik = rep(prop,times=nrow(data))
+ for (j in 1:nrow(data))
+ if(folds[j]!=0)
+ pik[j]=0
+ data.strat = cbind(pik,data.processed)
+ s<-samplecube(data.strat,pik,order=2,comment=F)
+ print(paste("fold size: ",sum(s)))
+ for (j in 1:nrow(data))
+ if (s[j] == 1)
+ folds[j]=i
+ }
+ for (j in 1:nrow(data))
+ if (folds[j] == 0)
+ folds[j]=num_folds
+ folds
+ }
+ else if (method == "cluster")
+ {
+ require("TunePareto")
+ cl = cluster(data.processed)
+ res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+ folds = rep(0, times=nrow(data))
+ for (i in 1:num_folds)
+ for(j in 1:length(res[[1]][[i]]))
+ folds[res[[1]][[i]][j]]=i
+ folds
+ }
+ else
+ stop("unknown method")
+}
+
+plot_pre_process <- function( data, method="pca" )
+{
+ data.processed = process_data( data )
+ if (method == "pca")
+ {
+ data.pca <- prcomp(data.processed, scale=TRUE)
+ as.data.frame(data.pca$x)[1:2]
+ }
+ else if (method == "smacof")
+ {
+ require("smacof")
+ data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+ data.emb$conf
+ }
+ else
+ stop("unknown method")
+}
+
+plot_split <- function( data, split, names=NULL, ... )
+{
+ if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+ stop("data not suitable for plotting, plot_pre_process() first")
+
+ plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+ if (is.null(names))
+ names <- c("split 1","split 2")
+ colos = as.double(rep(2:(max(split)+2)))
+ legend("topleft",names,pch=2,col=colos)
+
+ for (j in max(split):0)
+ {
+ set = c()
+ for (i in 1:nrow(data))
+ if (split[i] == j)
+ set = c(set,i)
+ points(data[set,], pch = 2, col=(j+2))
+ }
+}
+
+#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.33, method="cluster")
+#print(sum(split))
+#plot_split(plot_pre_process(data),split,c("training","test"))
+
+#cl = cluster(data)
+
+
+
+
diff --git a/lib/task.rb b/lib/task.rb
index e6fa5e1..102f4dc 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -242,16 +242,20 @@ module OpenTox
# waits for a task, unless time exceeds or state is no longer running
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @param [optional,Numeric] dur seconds pausing before cheking again for completion
- def wait_for_completion( waiting_task=nil, dur=0.3)
+ def wait_for_completion( waiting_task=nil)
waiting_task.waiting_for(self.uri) if waiting_task
due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+ start_time = Time.new
+ dur = 0
LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
load_metadata # for extremely fast tasks
check_state
while self.running? or self.queued?
sleep dur
+ dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+ #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
load_metadata
# if another (sub)task is waiting for self, set progress accordingly
waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
diff --git a/lib/transform.rb b/lib/transform.rb
new file mode 100644
index 0000000..8fe1093
--- /dev/null
+++ b/lib/transform.rb
@@ -0,0 +1,520 @@
+module OpenTox
+ module Transform
+ # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+ # LogAutoScaler for GSL vectors.
+ # Take log and scale.
+ class LogAutoScale
+ attr_accessor :vs, :offset, :autoscaler
+
+ # @param [GSL::Vector] Values to transform using LogAutoScaling.
+ def initialize values
+ @distance_to_zero = 1.0
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @offset = vs.min - @distance_to_zero
+ @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+ @vs = @autoscaler.vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to restore.
+ # @return [GSL::Vector] restored values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ rv = @autoscaler.restore(vs)
+ rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def mvlog values
+ values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+ end
+
+ end
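+ # Minimal usage sketch (hypothetical values, assuming GSL's to_gv helper):
+ #   las = OpenTox::Transform::LogAutoScale.new([1.0, 10.0, 100.0].to_gv)
+ #   las.vs                # autoscaled log10 values (offset is min - 1 = 0)
+ #   las.restore(las.vs)   # ~ [1.0, 10.0, 100.0]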
+
+
+ # Auto-Scaler for GSL vectors.
+ # Center on mean and divide by standard deviation.
+ class AutoScale
+ attr_accessor :vs, :mean, :stdev
+
+ # @param [GSL::Vector] values to transform using AutoScaling.
+ def initialize values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ vs = values.clone
+ @mean = vs.to_scale.mean
+ @stdev = vs.to_scale.standard_deviation_population
+ @stdev = 0.0 if @stdev.nan?
+ @vs = transform vs
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def transform values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ autoscale values.clone
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] Values to restore.
+ # @return [GSL::Vector] restored values.
+ def restore values
+ begin
+ raise "Cannot transform, values empty." if values.size==0
+ rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+ (rv_ss + @mean).to_gsl
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # @param [GSL::Vector] values to transform.
+ # @return [GSL::Vector] transformed values.
+ def autoscale values
+ vs_ss = values.clone.to_scale - @mean
+ @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+ end
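+ # Worked example (hypothetical values): for [1.0, 2.0, 3.0] the mean is 2.0
+ # and the population stdev ~0.816, so autoscale yields roughly
+ # [-1.22, 0.0, 1.22]; restore multiplies by stdev and re-adds the mean.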
+
+ end
+
+
+ # Principal Components Analysis.
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1], default 0.05.
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression.to_f
+ @mean = Array.new
+ @autoscaler = Array.new
+ @cols = Array.new
+ @maxcols = maxcols
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ @cols << i
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # PCA uses internal centering on 0
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+ (0..@cols.size-1).each { |i|
+ as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+ @mean << as.mean
+ @autoscaler << as
+ }
+
+ # PCA
+ data_matrix_hash = Hash.new
+ (0..@cols.size-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+
+ # Select best eigenvectors
+ pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..@cols.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Transforms data to feature space found by PCA.
+ #
+ # @param [GSL::Matrix] Data matrix.
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform values
+ begin
+ vs = values.clone
+ raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+ data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+ @cols.each_with_index { |i,j|
+ data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+ }
+ (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..@cols.size-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
+
+
+ # Singular Value Decomposition
+ class SVD
+ attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+ # Creates a transformed dataset as GSL::Matrix.
+ #
+ # @param [GSL::Matrix] Data matrix
+ # @param [Float] Compression ratio from [0,1], default 0.05
+ # @return [GSL::Matrix] Data transformed matrix
+
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix.clone
+ @compression = compression
+
+ # Compute the SV Decomposition X=USV
+ # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+ u, vt, s = data_matrix.SV_decomp
+
+ # Determine cutoff index
+ s2 = s.mul(s) ; s2_sum = s2.sum
+ s2_run = 0
+ k = s2.size - 1
+ s2.to_a.reverse.each { |v|
+ s2_run += v
+ frac = s2_run / s2_sum
+ break if frac > compression
+ k -= 1
+ }
+ k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
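+ # Worked example (hypothetical singular values): for s = [3, 2, 1] and
+ # compression 0.5: s2 = [9, 4, 1], sum 14. Walking the reversed values,
+ # 1/14 and 5/14 stay below 0.5 (k drops to 0), 14/14 exceeds it and breaks;
+ # the k == 0 guard then bumps k to 1, keeping a 2-dimensional approximation.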
+
+ # Take the k-rank approximation of the Matrix
+ # - Take first k columns of u
+ # - Take first k columns of vt
+ # - Take the first k eigenvalues
+ @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+ @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+ s = GSL::Matrix.diagonal(s)
+ @eigk = s.submatrix((0..k), (0..k))
+ @eigk_inv = @eigk.inv
+
+ # Transform data
+ @data_transformed_matrix = @uk # = u for all SVs
+ # NOTE: @data_transformed_matrix is also equal to
+ # @data_matrix * @vk * @eigk_inv
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Transforms data instance (1 row) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x m).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_instance values
+ begin
+ values * @vk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+ alias :transform :transform_instance # make this the default (see PCA interface)
+
+ # Transforms data feature (1 column) to feature space found by SVD.
+ #
+ # @param [GSL::Matrix] Data matrix (1 x n).
+ # @return [GSL::Matrix] Transformed data matrix.
+ def transform_feature values
+ begin
+ values * @uk * @eigk_inv
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Restores data in the original feature space (possibly with compression loss).
+ #
+ # @param [GSL::Matrix] Transformed data matrix.
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ end
+
+
+
+ # Attaches transformations to an OpenTox::Model
+ # Stores props, sims, performs similarity calculations
+ class ModelTransformer
+ attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+ # @params[OpenTox::Model] model to transform
+ def initialize model
+ @model = model
+ @similarity_algorithm = @model.similarity_algorithm
+ end
+
+ def transform
+ get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+ @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+ # Preprocessing
+ if (@model.similarity_algorithm == "Similarity.cosine")
+ # truncate nil-columns and -rows
+ LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ while @q_prop.size>0
+ idx = @q_prop.index(nil)
+ break if idx.nil?
+ @q_prop.slice!(idx)
+ @n_prop.each { |r| r.slice!(idx) }
+ end
+ LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ remove_nils # removes nil cells (for cosine); alters @n_prop, @q_prop, cuts down @ids to survivors
+ LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+ # adjust rest
+ fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+ cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+ # scale and svd
+ nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+ gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+ gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+ (0...nr_features).each { |i|
+ autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+ gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+ gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+ }
+ svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+ @n_prop = svd.data_transformed_matrix.to_a
+ @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+ LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ else
+ convert_nils # converts nil cells (for tanimoto); leaves @n_prop, @q_prop, @ids untouched
+ end
+
+ # neighbor calculation
+ @ids = [] # surviving compounds become neighbors
+ @sims = [] # calculated by neighbor routine
+ neighbors
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+ acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+ # Sims between neighbors, if necessary
+ gram_matrix = []
+ if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+ @n_prop.each_index do |i|
+ gram_matrix[i] = [] unless gram_matrix[i]
+ @n_prop.each_index do |j|
+ if (j>i)
+ sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+ gram_matrix[i][j] = sim
+ gram_matrix[j] = [] unless gram_matrix[j]
+ gram_matrix[j][i] = gram_matrix[i][j]
+ end
+ end
+ gram_matrix[i][i] = 1.0
+ end
+ end
+
+ # reclaim original data (if svd was performed)
+ if svd
+ @n_prop = gsl_n_prop_orig.to_a
+ n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+ @q_prop = gsl_q_prop_orig.row(0).to_a
+ end
+
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+ @sims = [ gram_matrix, @sims ]
+
+ end
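+ # Minimal usage sketch, mirroring the call in Model::Lazar#predict:
+ #   mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(model)
+ #   mtf.transform
+ #   mtf.props, mtf.acts, mtf.sims   # inputs for the prediction algorithm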
+
+
+
+
+ # Find neighbors and store them as object variable, access all compounds for that.
+ def neighbors
+ @model.neighbors = []
+ @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+ add_neighbor fp, idx
+ end
+ end
+
+
+ # Adds a neighbor to @model.neighbors if it passes the similarity threshold;
+ # records the surviving index in @ids and its similarity in @sims
+ def add_neighbor(training_props, idx)
+
+ sim = similarity(training_props)
+ if sim > @model.parameter("min_sim")
+ if @model.activities[@cmpds[idx]]
+ @model.activities[@cmpds[idx]].each do |act|
+ @model.neighbors << {
+ :compound => @cmpds[idx],
+ :similarity => sim,
+ :features => @fps[idx].keys,
+ :activity => act
+ }
+ @sims << sim
+ @ids << idx
+ end
+ end
+ end
+ end
+
+
+ # Removes nil entries from n_prop and q_prop.
+ # Matrix is a nested two-dimensional array.
+ # Iteratively removes the row or column with the highest fraction of nil entries, until all nil entries are removed.
+ # Tie break: columns take precedence.
+ # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+ # Enables the use of cosine similarity / SVD
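+ # Example: for @n_prop = [[1, nil, 2], [3, 4, 5]] and @q_prop = [6, 7, 8],
+ # column 1 has the highest nil fraction (0.5) and is removed from both,
+ # leaving @n_prop = [[1, 2], [3, 5]] and @q_prop = [6, 8].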
+ def remove_nils
+ return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+ col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols = col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ while ((m_cols > 0) || (m_rows > 0)) do
+ if m_cols >= m_rows
+ @n_prop.each { |row| row.slice!(idx_cols) }
+ @q_prop.slice!(idx_cols)
+ else
+ @n_prop.slice!(idx_rows)
+ @ids.slice!(idx_rows)
+ end
+ break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+ col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+ row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+ m_cols = col_nr_nils.max
+ m_rows = row_nr_nils.max
+ idx_cols = col_nr_nils.index(m_cols)
+ idx_rows = row_nr_nils.index(m_rows)
+ end
+ end
+
+
+ # Replaces nils by zeroes in n_prop and q_prop
+ # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
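+ # Example: @n_prop = [[1, nil], [nil, 2]] becomes [[1, 0], [0, 2]]; @q_prop analogously.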
+ def convert_nils
+ @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+ @q_prop.collect! { |v| v.nil? ? 0 : v }
+ end
+
+
+ # Executes the model's similarity_algorithm on training_props and @q_prop
+ def similarity(training_props)
+ eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+ end
+
+
+ # Converts model fingerprints to a matrix (@n_prop), one row per fingerprint; nil values are allowed.
+ # The query compound's fingerprints are converted analogously (@q_prop).
+ def get_matrices
+
+ @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+ @model.fingerprints.each { |fp|
+ cmpd = fp[0]; fp = fp[1]
+ if @model.activities[cmpd] # row good
+ acts = @model.activities[cmpd]; @acts += acts
+ LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+ row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+ acts.size.times { # multiple additions for multiple activities
+ @n_prop << row.dup # shallow copy suffices for a flat row of values
+ @cmpds << cmpd
+ @fps << Marshal.load(Marshal.dump(fp))
+ }
+ else
+ LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+ end
+ }
+
+ @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+ end
+
+ def props
+ @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+ end
+
+ end
+
+ end
+end
diff --git a/lib/utils.rb b/lib/utils.rb
new file mode 100644
index 0000000..d9d7b4b
--- /dev/null
+++ b/lib/utils.rb
@@ -0,0 +1,372 @@
+require 'csv'
+
+
+module OpenTox
+
+ module Algorithm
+
+ include OpenTox
+
+ # Calculate physico-chemical descriptors.
+ # @param[Hash] Required keys: :dataset_uri, :pc_type
+ # @return[String] dataset uri
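+ # A usage sketch with a hypothetical dataset uri:
+ #   pc_descriptors(:dataset_uri => "http://host/dataset/1", :pc_type => "electronic,cpsa")
+ #   # => uri of a new dataset holding the calculated descriptors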
+
+ def self.pc_descriptors(params)
+
+ begin
+ ds = OpenTox::Dataset.find(params[:dataset_uri])
+ compounds = ds.compounds.collect
+ ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
+ #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
+ LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+ load_ds_csv(ambit_result_uri, smiles_to_inchi)
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+
+ end
+
+ # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+ # @param[Hash] Required keys: :compounds, :pc_type
+ # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
+ def self.get_pc_descriptors(params)
+
+ begin
+
+ ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+ ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+ descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
+ descs_uris = []
+ params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
+ types = params[:pc_type].split(",")
+ descs.each { |uri, cat_name|
+ if types.include? cat_name[:category]
+ descs_uris << uri
+ end
+ }
+ if descs_uris.size == 0
+ raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
+ end
+ #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
+
+ begin
+ # Create SMI
+ smiles_array = []; smiles_to_inchi = {}
+ params[:compounds].each do |n|
+ cmpd = OpenTox::Compound.new(n)
+ smiles_string = cmpd.to_smiles
+ smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
+ smiles_array << smiles_string
+ end
+ smi_file = Tempfile.open(['pc_ambit', '.csv'])
+ pc_descriptors = nil
+
+ # Create Ambit dataset
+ smi_file.puts( "SMILES\n" )
+ smi_file.puts( smiles_array.join("\n") )
+ smi_file.flush
+ ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ smi_file.close! if smi_file
+ end
+ ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
+
+ # Calculate 3D for CPSA
+ if types.include? "cpsa"
+ ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
+ LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+ end
+
+ # Get Ambit results
+ ambit_result_uri = [] # 1st pos: base uri, then features
+ ambit_result_uri << ambit_ds_uri + "?"
+ ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+ descs_uris.each_with_index do |uri, i|
+ algorithm = Algorithm::Generic.new(uri)
+ result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
+ ambit_result_uri << result_uri.split("?")[1] + "&"
+ LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+ end
+ #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
+ [ ambit_result_uri, smiles_to_inchi ]
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+
+ # Load dataset via CSV
+ # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
+ # @return[String] dataset uri
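+ # Example: for ambit_result_uri = ["http://host/dataset/1?", "feature_uris[]=...&"],
+ # each feature chunk is appended to the base uri (element 0) and fetched as CSV.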
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+
+ master=nil
+ (1...ambit_result_uri.size).each { |idx|
+ curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+ LOGGER.debug "Requesting #{curr_uri}"
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+ if csv_data[0] && csv_data[0].size>1
+ if master.nil? # This is the smiles entry
+ (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+ master = csv_data
+ next
+ else
+ index_uri = csv_data[0].index("SMILES")
+ csv_data.map {|i| i.delete_at(index_uri)} if index_uri # Removes additional SMILES information
+
+ nr_cols = (csv_data[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ csv_data.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ end
+ end
+ }
+
+ index_uri = master[0].index("Compound")
+ master.map {|i| i.delete_at(index_uri)}
+ master[0].each {|cell| cell.chomp!(" ")}
+ master[0][0] = "Compound" #"SMILES"
+ index_smi = master[0].index("SMILES")
+ master.map {|i| i.delete_at(index_smi)} if index_smi
+ #master[0][0] = "SMILES"
+
+ #LOGGER.debug "-------- AM: Writing to dumpfile"
+ #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
+
+ parser = OpenTox::Parser::Spreadsheets.new
+ ds = OpenTox::Dataset.new(nil,subjectid)
+ ds.save(subjectid)
+ parser.dataset = ds
+ ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
+ ds.save(subjectid)
+ end
+
+
+ # Gauss kernel
+ # @param [Float] x similarity value (the distance d = 1 - x is used)
+ # @param [Float] sigma kernel width, defaults to 0.3
+ # @return [Float]
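+ # Example: gauss(0.7) # => ~0.61, since d = 0.3 and exp(-0.09/(2*0.09)) = exp(-0.5)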
+ def self.gauss(x, sigma = 0.3)
+ d = 1.0 - x.to_f
+ Math.exp(-(d*d)/(2*sigma*sigma))
+ end
+
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
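+ # Examples: [0, 0, 0] (non-occurring), [0, 1, 0] (singular) and [1, 2, 3]
+ # (present everywhere) all yield true; [0, 1, 2] yields false.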
+ def self.isnull_or_singular?(array)
+ nr_zeroes = array.count(0)
+ return (nr_zeroes == array.size) || # remove non-occurring feature
+ (nr_zeroes == array.size-1) || # remove singular feature
+ (nr_zeroes == 0) # also remove feature present everywhere
+ end
+
+
+ # Numeric value test
+ # @param[Object] value
+ # @return [Boolean] Whether value is a number
+ def self.numeric?(value)
+ true if Float(value) rescue false
+ end
+
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature has variance zero.
+ def self.zero_variance?(array)
+ return array.uniq.size == 1
+ end
+
+
+ # Sum of the sizes of an array of Enumerables.
+ # @param [Array] Array of Enumerables
+ # @return [Integer] Sum of the sizes of all elements
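+ # Example: sum_size([[1, 2], [3], [4, 5, 6]]) # => 6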
+ def self.sum_size(array)
+ sum=0
+ array.each { |e| sum += e.size }
+ return sum
+ end
+
+
+ # Minimum Frequency
+ # @param [OpenTox::Dataset] training dataset
+ # @param [Integer] per-mil value
+ # @return [Integer] min-frequency
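+ # Example: for 1000 training compounds and per_mil = 8, minfreq = 8.0 and
+ # Integer(8.0) => 8; for 100 compounds the raw value 0.8 is raised to the minimum of 2.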
+ def self.min_frequency(training_dataset,per_mil)
+ minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = 2 unless minfreq > 2
+ Integer(minfreq)
+ end
+
+
+ # Effect calculation for classification
+ # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # @param [Array] Array of database instance counts per class.
+ # @return [Integer] Index of the class with the strongest over-representation.
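+ # Example (hypothetical counts): effect([[1,2,3,4,5,6,7,8,9], [10]], [50, 50]) # => 0,
+ # since class 0 covers 90% of the occurrences but only 50% of the database.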
+ def self.effect(occurrences, db_instances)
+ max=0
+ max_value=0
+ nr_o = self.sum_size(occurrences)
+ nr_db = db_instances.to_scale.sum
+
+ occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
+ actual = o.size.to_f/nr_o
+ expected = db_instances[i].to_f/nr_db
+ if actual > expected
+ if ((actual - expected) / actual) > max_value
+ max_value = (actual - expected) / actual # 'Schleppzeiger'
+ max = i
+ end
+ end
+ }
+ max
+ end
+
+
+ # neighbors
+
+ module Neighbors
+
+ # Get confidence.
+ # @param[Hash] Required keys: :sims, :acts
+ # @return[Float] Confidence
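+ # Example (hypothetical values; only :sims enters the calculation):
+ #   get_confidence(:sims => [0.8, 0.6, 0.4], :acts => [1, 0, 1]) # => 0.6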
+ def self.get_confidence(params)
+ conf = params[:sims].inject{|sum,x| sum + x }
+ confidence = conf/params[:sims].size
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+ return confidence
+ end
+
+ end
+
+
+ # Similarity calculations
+ module Similarity
+
+ # Tanimoto similarity
+ # @param [Hash, Array] fingerprints of first compound
+ # @param [Hash, Array] fingerprints of second compound
+ # @return [Float] (Weighted) tanimoto similarity
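+ # Example with hypothetical hash fingerprints:
+ #   tanimoto({"C-C" => 2, "C=O" => 1}, {"C-C" => 1, "N-H" => 3})
+ #   # => 1/6.0, since the min-sum over common features is 1 and the max-sum over all features is 2+1+3 = 6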
+ def self.tanimoto(fingerprints_a,fingerprints_b,weights=nil,params=nil)
+
+ common_p_sum = 0.0
+ all_p_sum = 0.0
+
+ # fingerprints are hashes
+ if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+ common_features = fingerprints_a.keys & fingerprints_b.keys
+ all_features = (fingerprints_a.keys + fingerprints_b.keys).uniq
+ if common_features.size > 0
+ common_features.each{ |f| common_p_sum += [ fingerprints_a[f], fingerprints_b[f] ].min }
+ all_features.each{ |f| all_p_sum += [ fingerprints_a[f],fingerprints_b[f] ].compact.max } # compact, since one fp may be empty at that pos
+ end
+
+ # fingerprints are arrays
+ elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+ size = [ fingerprints_a.size, fingerprints_b.size ].min
+ LOGGER.warn "fingerprints don't have equal size" if fingerprints_a.size != fingerprints_b.size
+ (0...size).each { |idx|
+ common_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].min
+ all_p_sum += [ fingerprints_a[idx], fingerprints_b[idx] ].max
+ }
+ end
+
+ (all_p_sum > 0.0) ? (common_p_sum/all_p_sum) : 0.0
+
+ end
+
+
+ # Cosine similarity
+ # @param [Hash, Array] fingerprints_a key-value properties of first compound
+ # @param [Hash, Array] fingerprints_b key-value properties of second compound
+ # @return [Float] cosine of the angle enclosed between vectors induced by keys present in both a and b
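+ # Example with hypothetical array fingerprints (uses the GSL-backed to_gv helper):
+ #   cosine([1, 0, 1], [1, 1, 0]) # => 0.5, i.e. dot product 1 / norm product 2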
+ def self.cosine(fingerprints_a,fingerprints_b,weights=nil)
+
+ # fingerprints are hashes
+ if fingerprints_a.class == Hash && fingerprints_b.class == Hash
+ a = []; b = []
+ common_features = fingerprints_a.keys & fingerprints_b.keys
+ if common_features.size > 1
+ common_features.each do |p|
+ a << fingerprints_a[p]
+ b << fingerprints_b[p]
+ end
+ end
+
+ # fingerprints are arrays
+ elsif fingerprints_a.class == Array && fingerprints_b.class == Array
+ a = fingerprints_a
+ b = fingerprints_b
+ end
+
+ (a.size > 0 && b.size > 0) ? self.cosine_num(a.to_gv, b.to_gv) : 0.0
+
+ end
+
+
+ # Cosine similarity
+ # @param [GSL::Vector] a
+ # @param [GSL::Vector] b
+ # @return [Float] cosine of angle enclosed between a and b
+ def self.cosine_num(a, b)
+ if a.size>12 && b.size>12 # truncate long vectors to their first 12 components
+ a = a[0..11]
+ b = b[0..11]
+ end
+ a.dot(b) / (a.norm * b.norm)
+ end
+
+
+ # Outlier detection based on Mahalanobis distances
+ # Multivariate detection on X, univariate detection on y
+ # Uses an existing Rinruby instance, if possible
+ # @param[Hash] Keys query_matrix, data_matrix, acts are required; r, p_outlier optional
+ # @return[Array] indices identifying outliers (may occur several times, this is intended)
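+ # A usage sketch with hypothetical inputs (requires R with the robustbase package):
+ #   outliers(:data_matrix => gsl_n_prop, :query_matrix => gsl_q_prop, :acts => @acts)
+ #   # => e.g. [3, 7, 7]; an index may appear once per test that flags it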
+ def self.outliers(params)
+ outlier_array = []
+ data_matrix = params[:data_matrix]
+ query_matrix = params[:query_matrix]
+ acts = params[:acts]
+ begin
+ LOGGER.debug "Outliers (p=#{params[:p_outlier] || 0.9999})..."
+ r = ( params[:r] || RinRuby.new(false,false) )
+ r.eval "suppressPackageStartupMessages(library(\"robustbase\"))"
+ r.eval "outlier_threshold = #{params[:p_outlier] || 0.999}"
+ nr_cases, nr_features = data_matrix.to_a.size, data_matrix.to_a[0].size
+ r.odx = data_matrix.to_a.flatten
+ r.q = query_matrix.to_a.flatten
+ r.y = acts.to_a.flatten
+ r.eval "odx = matrix(odx, #{nr_cases}, #{nr_features}, byrow=T)"
+ r.eval 'odx = rbind(q,odx)' # query is nr 0 (1) in ruby (R)
+ r.eval 'mah = covMcd(odx)$mah' # run MCD alg
+ r.eval "mah = pchisq(mah,#{nr_features})"
+ r.eval 'outlier_array = which(mah>outlier_threshold)' # multivariate outliers using robust mahalanobis
+ outlier_array = r.outlier_array.to_a.collect{|v| v-2 } # translate to ruby index (-1 for q, -1 due to ruby)
+ r.eval 'fqu = matrix(summary(y))[2]'
+ r.eval 'tqu = matrix(summary(y))[5]'
+ r.eval 'outlier_array = which(y>(tqu+1.5*IQR(y)))' # univariate outliers due to Tukey (http://goo.gl/mwzNH)
+ outlier_array += r.outlier_array.to_a.collect{|v| v-1 } # translate to ruby index (-1 due to ruby)
+ r.eval 'outlier_array = which(y<(fqu-1.5*IQR(y)))'
+ outlier_array += r.outlier_array.to_a.collect{|v| v-1 }
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ outlier_array
+ end
+
+ end
+
+
+ end
+
+end
+
diff --git a/lib/validation.rb b/lib/validation.rb
index 646b076..85004c7 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,3 +1,4 @@
+require "yaml"
module OpenTox
class Validation
include OpenTox
@@ -66,7 +67,7 @@ module OpenTox
# @return [String] report uri
def find_or_create_report( subjectid=nil, waiting_task=nil )
@report = ValidationReport.find_for_validation(@uri, subjectid) unless @report
- @report = ValidationReport.create(@uri, subjectid, waiting_task) unless @report
+ @report = ValidationReport.create(@uri, {}, subjectid, waiting_task) unless @report
@report.uri
end
@@ -107,6 +108,31 @@ module OpenTox
end
table
end
+
+ # returns probability-distribution for a given prediction
+ # it takes all predictions into account that have a confidence value >= the given confidence and the same predicted value
+ # (at least the 12 predictions with the highest confidence are selected, even if their confidence is below the given param)
+ #
+ # @param [Float] confidence value (between 0 and 1)
+ # @param [String] predicted value
+ # @param [String,optional] subjectid
+ # @return [Hash] see example
+ #
+ # Example 1:
+ # validation.probabilities(0.3,"active")
+ # -> {:min_confidence=>0.32, :num_predictions=>20, :probs=>{"active"=>0.7, "moderate"=>0.25, "inactive"=>0.05}}
+ # there have been 20 "active" predictions with confidence >= 0.3, 70 percent of them being correct
+ #
+ # Example 2:
+ # validation.probabilities(0.8,"active")
+ # -> {:min_confidence=>0.45, :num_predictions=>12, :probs=>{"active"=>0.9, "moderate"=>0.1, "inactive"=>0}}
+ # the given confidence value was too high (i.e. <12 predictions with confidence value >= 0.8)
+ # the top 12 "active" predictions have a min_confidence of 0.45, 90 percent of them being correct
+ #
+ def probabilities( confidence, prediction, subjectid=nil )
+ YAML.load(OpenTox::RestClientWrapper.get(@uri+"/probabilities?prediction="+prediction.to_s+"&confidence="+confidence.to_s,
+ {:subjectid => subjectid, :accept => "application/x-yaml"}))
+ end
end
class Crossvalidation
@@ -168,6 +194,13 @@ module OpenTox
def statistics( subjectid=nil )
Validation.from_cv_statistics( @uri, subjectid )
end
+
+ # for documentation see OpenTox::Validation.probabilities
+ def probabilities( confidence, prediction, subjectid=nil )
+ YAML.load(OpenTox::RestClientWrapper.get(@uri+"/statistics/probabilities?prediction="+prediction.to_s+"&confidence="+confidence.to_s,
+ {:subjectid => subjectid, :accept => "application/x-yaml"}))
+ end
+
end
class ValidationReport
@@ -196,12 +229,18 @@ module OpenTox
# creates a validation report via validation
# @param [String] validation uri
+ # @param [Hash] params additional possible parameters
+ # (min_confidence, min_num_predictions, max_num_predictions)
# @param [String,optional] subjectid
# @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [OpenTox::ValidationReport]
- def self.create( validation_uri, subjectid=nil, waiting_task=nil )
+ def self.create( validation_uri, params={}, subjectid=nil, waiting_task=nil )
+ params = {} if params==nil
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
+ params[:validation_uris] = validation_uri
+ params[:subjectid] = subjectid
uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/validation"),
- { :validation_uris => validation_uri, :subjectid => subjectid }, {}, waiting_task )
+ params, {}, waiting_task )
ValidationReport.new(uri)
end
@@ -268,15 +307,17 @@ module OpenTox
uris.size==0 ? nil : AlgorithmComparisonReport.new(uris[-1])
end
- # creates a crossvalidation report via crossvalidation
+ # creates an algorithm comparison report via crossvalidation uris
# @param [Hash] crossvalidation uri_hash, see example
+ # @param [Hash] params additional possible parameters
+ # (ttest_significance, ttest_attributes, min_confidence, min_num_predictions, max_num_predictions)
# @param [String,optional] subjectid
# @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
# @return [OpenTox::AlgorithmComparisonReport]
# example for hash:
# { :lazar-bbrc => [ http://host/validation/crossvalidation/x1, http://host/validation/crossvalidation/x2 ],
# :lazar-last => [ http://host/validation/crossvalidation/xy, http://host/validation/crossvalidation/xy ] }
- def self.create( crossvalidation_uri_hash, subjectid=nil, waiting_task=nil )
+ def self.create( crossvalidation_uri_hash, params={}, subjectid=nil, waiting_task=nil )
identifier = []
validation_uris = []
crossvalidation_uri_hash.each do |id, uris|
@@ -285,8 +326,13 @@ module OpenTox
validation_uris << uri
end
end
+ params = {} if params==nil
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
+ params[:validation_uris] = validation_uris.join(",")
+ params[:identifier] = identifier.join(",")
+ params[:subjectid] = subjectid
uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/algorithm_comparison"),
- { :validation_uris => validation_uris.join(","), :identifier => identifier.join(","), :subjectid => subjectid }, {}, waiting_task )
+ params, {}, waiting_task )
AlgorithmComparisonReport.new(uri)
end
end
diff --git a/opentox-ruby.gemspec b/opentox-ruby.gemspec
index 2ec5a18..900d53f 100644
--- a/opentox-ruby.gemspec
+++ b/opentox-ruby.gemspec
@@ -5,7 +5,7 @@
Gem::Specification.new do |s|
s.name = %q{opentox-ruby}
- s.version = "3.0.0"
+ s.version = "3.1.0"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
@@ -45,6 +45,8 @@ Gem::Specification.new do |s|
"lib/templates/default_guest_policy.xml",
"lib/templates/default_policy.xml",
"lib/to-html.rb",
+ "lib/transform.rb",
+ "lib/utils.rb"
"lib/validation.rb"
]
s.homepage = %q{http://github.com/opentox/opentox-ruby}