author     Christoph Helma <helma@in-silico.ch>   2015-08-01 17:53:26 +0200
committer  Christoph Helma <helma@in-silico.ch>   2015-08-01 17:53:26 +0200
commit     cdc0e4dd01c4b8da3a43e7d6b49a7e09a9881d63 (patch)
tree       05ace2752a2218c614cdc61ec759d7cdf8a2f7d9
parent     28c41fc27bea4668ee1dc3c8d1f086e64d271b5a (diff)
50 times faster bbrc setup by eliminating @fminer.add_fminer_data
-rw-r--r--  babel_3d_cache/.gitkeep   |   0
-rw-r--r--  lib/algorithm.rb          |   1
-rw-r--r--  lib/bbrc.rb               |  61
-rw-r--r--  lib/classification.rb     |  10
-rw-r--r--  lib/fminer.rb             |  12
-rw-r--r--  lib/lazar.rb              | 201
-rw-r--r--  lib/opentox-algorithm.rb  |   2
-rw-r--r--  lib/regression.rb         | 174
-rw-r--r--  lib/transform.rb          |   6
9 files changed, 311 insertions(+), 156 deletions(-)
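
The core of the speedup is visible in lib/bbrc.rb below: instead of delegating to @fminer.add_fminer_data (which re-reads activities through the Fminer helper object), the setup now feeds SMILES and encoded activities straight into the Bbrc miner. A minimal sketch of that loop, assuming only what the diff itself shows (the Bbrc::Bbrc SWIG interface with Reset/SetMinfreq/AddCompound/AddActivity, a training_dataset exposing compounds and data_entries, and the value2act mapping built from the prediction feature's accept values); this is illustrative, not a drop-in replacement for the patched method:

    # direct bbrc setup, as introduced by this commit (sketch)
    bbrc = Bbrc::Bbrc.new
    bbrc.Reset
    bbrc.SetMinfreq(minfreq)

    training_dataset.compounds.each_with_index do |compound, i|
      # fminer compound/activity ids are 1-based
      bbrc.AddCompound(compound.smiles, i + 1)
      # map the nominal accept value to its integer class label
      act = value2act[training_dataset.data_entries[i].first]
      bbrc.AddActivity(act, i + 1)
    end

Skipping the intermediate Fminer bookkeeping avoids one pass over the dataset per compound, which is where the roughly 50x setup speedup comes from.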
diff --git a/babel_3d_cache/.gitkeep b/babel_3d_cache/.gitkeep
deleted file mode 100644
index e69de29..0000000
--- a/babel_3d_cache/.gitkeep
+++ /dev/null
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index eda7588..0e227d6 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -3,6 +3,7 @@ module OpenTox
module Algorithm
def self.run algorithm, object, parameters={}
+ bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
klass,method = algorithm.split('.')
parameters.empty? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
end
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
index 595d712..6b0eb26 100644
--- a/lib/bbrc.rb
+++ b/lib/bbrc.rb
@@ -1,6 +1,9 @@
module OpenTox
module Algorithm
class Fminer
+ TABLE_OF_ELEMENTS = [
+"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
+
#
# Run bbrc algorithm on dataset
#
@@ -14,27 +17,40 @@ module OpenTox
# - nr_hits Set to "true" to get hit count instead of presence
# - get_target Set to "true" to obtain target variable as feature
# @return [text/uri-list] Task URI
- def self.bbrc dataset, params={}
-
- table_of_elements = [
-"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
-
- @fminer=OpenTox::Algorithm::Fminer.new
- @fminer.check_params(dataset,params,5)
+ def self.bbrc training_dataset, params={}
time = Time.now
+ bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+
+ prediction_feature = training_dataset.features.first
+ if params[:min_frequency]
+ minfreq = params[:min_frequency]
+ else
+ per_mil = 5 # value from latest version
+ i = training_dataset.feature_ids.index prediction_feature.id
+ nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
+ minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = 2 unless minfreq > 2
+ minfreq = minfreq.round
+ end
+
+ #@fminer=OpenTox::Algorithm::Fminer.new
+ #@fminer.check_params(dataset,params,5)
+ #p @fminer.instance_variables
+
@bbrc = Bbrc::Bbrc.new
@bbrc.Reset
- if @fminer.prediction_feature.numeric
+ if prediction_feature.numeric
@bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
else
bad_request_error "No accept values for "\
- "dataset '#{@fminer.training_dataset.id}' and "\
- "feature '#{@fminer.prediction_feature.id}'" unless @fminer.prediction_feature.accept_values
- value_map = @fminer.prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=@fminer.prediction_feature.accept_values[idx]; h }
+ "dataset '#{training_dataset.id}' and "\
+ "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
+ act2value = prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=prediction_feature.accept_values[idx]; h }
+ value2act = act2value.invert
end
- @bbrc.SetMinfreq(@fminer.minfreq)
+ @bbrc.SetMinfreq(minfreq)
@bbrc.SetType(1) if params[:feature_type] == "paths"
@bbrc.SetBackbone(false) if params[:backbone] == "false"
@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
@@ -42,21 +58,28 @@ module OpenTox
params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
feature_dataset = FminerDataset.new(
- :training_dataset_id => dataset.id,
+ :training_dataset_id => training_dataset.id,
:training_algorithm => "#{self.to_s}.bbrc",
- :training_feature_id => params[:prediction_feature].id ,
+ :training_feature_id => prediction_feature.id ,
:training_parameters => {
- :min_frequency => @fminer.minfreq,
+ :min_frequency => minfreq,
:nr_hits => nr_hits,
:backbone => (params[:backbone] == false ? false : true)
}
)
- feature_dataset.compounds = dataset.compounds
+ feature_dataset.compounds = training_dataset.compounds
+ $logger.debug "Setup: #{Time.now-time}"
+ time = Time.now
# Add data to fminer
- @fminer.add_fminer_data(@bbrc, value_map)
- g_median=@fminer.all_activities.values.to_scale.median
+ #@fminer.add_fminer_data(@bbrc, value_map)
+ training_dataset.compounds.each_with_index do |compound,i|
+ @bbrc.AddCompound(compound.smiles,i+1)
+ act = value2act[training_dataset.data_entries[i].first]
+ @bbrc.AddActivity(act,i+1)
+ end
+ #g_median=@fminer.all_activities.values.to_scale.median
#task.progress 10
#step_width = 80 / @bbrc.GetNoRootNodes().to_f
@@ -76,7 +99,7 @@ module OpenTox
smarts = f.shift
# convert fminer SMARTS representation into a more human readable format
smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
- element = table_of_elements[$1.to_i-1]
+ element = TABLE_OF_ELEMENTS[$1.to_i-1]
$2 == "a" ? element.downcase : element
end
p_value = f.shift
diff --git a/lib/classification.rb b/lib/classification.rb
index 127fa28..d71ab77 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -8,12 +8,8 @@ module OpenTox
# @return [Numeric] A prediction value.
def self.weighted_majority_vote(neighbors)
- return {:prediction => nil, :confidence => nil} if neighbors.empty?
-
neighbor_contribution = 0.0
confidence_sum = 0.0
- confidence = 0.0
- prediction = nil
$logger.debug "Weighted Majority Vote Classification."
@@ -39,14 +35,14 @@ module OpenTox
elsif confidence_sum < 0.0
prediction = values[0]
end
+ elsif values.size == 1 # all neighbors have the same value
+ prediction = values[0]
else
prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
end
- $logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil?
confidence = (confidence_sum/neighbors.size).abs
- $logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil?
- [prediction, confidence.abs]
+ {:value => prediction, :confidence => confidence.abs}
end
# Local support vector regression from neighbors
diff --git a/lib/fminer.rb b/lib/fminer.rb
index 666cefa..37be183 100644
--- a/lib/fminer.rb
+++ b/lib/fminer.rb
@@ -186,16 +186,8 @@ module OpenTox
# @param [Integer] per-mil value
# return [Integer] min-frequency
def min_frequency(training_dataset,prediction_feature,per_mil)
- nr_labeled_cmpds = DataEntry.where(dataset_id: training_dataset.id, feature_id: prediction_feature.id).in(compound_id: training_dataset.compound_ids).count
- #nr_labeled_cmpds=0
- #f_idx=training_dataset.features.index prediction_feature
- #training_dataset.compounds.each_with_index { |cmpd, c_idx|
- #if ( training_dataset.data_entries[c_idx] )
- #unless training_dataset.data_entries[c_idx][f_idx].nil?
- #nr_labeled_cmpds += 1
- #end
- #end
- #}
+ i = training_dataset.feature_ids.index prediction_feature.id
+ nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
minfreq = 2 unless minfreq > 2
Integer (minfreq)
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 399f5c1..d9195ad 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -1,10 +1,3 @@
-=begin
-* Name: lazar.rb
-* Description: Lazar model representation
-* Author: Andreas Maunz <andreas@maunz.de>, Christoph Helma
-* Date: 10/2012
-=end
-
module OpenTox
module Model
@@ -16,26 +9,18 @@ module OpenTox
store_in collection: "models"
field :title, type: String
- field :description, type: String
- #field :parameters, type: Array, default: []
+ field :endpoint, type: String
field :creator, type: String, default: __FILE__
# datasets
field :training_dataset_id, type: BSON::ObjectId
field :feature_dataset_id, type: BSON::ObjectId
# algorithms
- #field :feature_generation, type: String
- #field :feature_calculation_algorithm, type: String
+ field :feature_calculation_algorithm, type: String
field :prediction_algorithm, type: String
field :similarity_algorithm, type: String
- # prediction features
- field :prediction_feature_id, type: BSON::ObjectId
- field :predicted_value_id, type: BSON::ObjectId
- field :predicted_variables, type: Array
- # parameters
- field :nr_hits, type: Boolean
field :min_sim, type: Float
- #field :propositionalized, type:Boolean
- field :min_train_performance, type: Float
+ # prediction feature
+ field :prediction_feature_id, type: BSON::ObjectId
attr_accessor :prediction_dataset
attr_accessor :training_dataset
@@ -43,84 +28,31 @@ module OpenTox
attr_accessor :query_fingerprint
attr_accessor :neighbors
- # Check parameters for plausibility
- # Prepare lazar object (includes graph mining)
- # @param[Array] lazar parameters as strings
- # @param[Hash] REST parameters, as input by user
- def self.create training_dataset, feature_dataset, prediction_feature=nil, nr_hits=false, params={}
-
- lazar = OpenTox::Model::Lazar.new
+ # Create a lazar model from a training_dataset and a feature_dataset
+ # @param [OpenTox::Dataset] training_dataset
+ # @param [OpenTox::Dataset] feature_dataset
+ # @return [OpenTox::Model::Lazar] Regression or classification model
+ def self.create training_dataset, feature_dataset
bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty?
- lazar.feature_dataset_id = feature_dataset.id
- @training_dataset = training_dataset
- bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless @training_dataset.compounds == feature_dataset.compounds
- lazar.training_dataset_id = @training_dataset.id
-
- if prediction_feature
- resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{@training_dataset.id}'" unless @training_dataset.features.include?( params[:prediction_feature] )
- else # try to read prediction_feature from dataset
- resource_not_found_error "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
- prediction_feature = @training_dataset.features.first
- end
+ bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+ bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds
+ prediction_feature = training_dataset.features.first
+ prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
+ lazar.feature_dataset_id = feature_dataset.id
+ lazar.training_dataset_id = training_dataset.id
lazar.prediction_feature_id = prediction_feature.id
lazar.title = prediction_feature.title
- if params and params[:prediction_algorithm]
- bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" unless OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm])
- lazar.prediction_algorithm = params[:prediction_algorithm]
- end
-
- unless lazar.prediction_algorithm # set defaults
- # TODO consider params
- if prediction_feature.nominal
- lazar.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
- lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto"
- lazar.min_sim = 0.3 unless lazar.min_sim
- elsif prediction_feature.numeric
- lazar.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression"
- lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine"
- # cosine similartiy is default
- lazar.min_sim = 0.7 unless lazar.min_sim
- end
- end
- #lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true
-
- lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric?
- # TODO: get info from training_dataset
- lazar.nr_hits = nr_hits
- #lazar.feature_generation = feature_dataset.training_algorithm
- #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]}
-
- bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric?
- lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric?
- lazar.min_train_performance = 0.1 unless lazar.min_train_performance
-
lazar.save
lazar
end
def predict object
- # tailored for performance
- # all consistency checks should be done during model creation
-
time = Time.now
- # prepare prediction dataset
- prediction_dataset = LazarPrediction.new
- prediction_feature = OpenTox::Feature.find prediction_feature_id
- prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}",
- prediction_dataset.creator = __FILE__,
-
- confidence_feature = OpenTox::Feature.find_or_create_by({
- "title" => "Prediction confidence",
- "numeric" => true
- })
-
- prediction_dataset.features = [ confidence_feature, prediction_feature ]
-
@training_dataset = OpenTox::Dataset.find(training_dataset_id)
@feature_dataset = OpenTox::Dataset.find(feature_dataset_id)
@@ -139,52 +71,44 @@ module OpenTox
$logger.debug "Setup: #{Time.now-time}"
time = Time.now
- @query_fingerprint = Algorithm.run(feature_dataset.feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} )
-
- $logger.debug "Fingerprint calculation: #{Time.now-time}"
- time = Time.now
+ @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} )
- # AM: transform to cosine space
- min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
+ $logger.debug "Query fingerprint calculation: #{Time.now-time}"
+ predictions = []
+ prediction_feature = OpenTox::Feature.find prediction_feature_id
+ tt = 0
+ pt = 0
compounds.each_with_index do |compound,c|
+ t = Time.new
$logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}"
database_activities = @training_dataset.values(compound,prediction_feature)
if database_activities and !database_activities.empty?
- database_activities.each do |database_activity|
- $logger.debug "do not predict compound, it occurs in dataset with activity #{database_activity}"
- prediction_dataset.compound_ids << compound.id
- prediction_dataset[c,0] = database_activity
- prediction_dataset[c,1] = nil
- end
+ database_activities = database_activities.first if database_activities.size == 1
+ $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}"
+ predictions << {:compound => compound, :value => database_activities, :confidence => "measured"}
next
else
- t = Time.new
if prediction_algorithm =~ /Regression/
mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
mtf.transform
training_fingerprints = mtf.n_prop
- training_activities = mtf.activities
- p training_activities
query_fingerprint = mtf.q_prop
neighbors = [[nil,nil,nil,query_fingerprint]]
else
training_fingerprints = @feature_dataset.data_entries
- # TODO fix for multi feature datasets
- training_activities = @training_dataset.data_entries[i].first
query_fingerprint = @query_fingerprint[c]
neighbors = []
end
- $logger.debug "Transform: #{Time.now-t}"
+ tt += Time.now-t
t = Time.new
# find neighbors
training_fingerprints.each_with_index do |fingerprint, i|
-
sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint)
if sim > self.min_sim
if prediction_algorithm =~ /Regression/
@@ -195,40 +119,85 @@ module OpenTox
end
end
+ if neighbors.empty?
+ predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"}
+ #$logger.warn "No neighbors found for compound #{compound}."
+ next
+ end
+
if prediction_algorithm =~ /Regression/
prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance)
else
prediction = Algorithm.run(prediction_algorithm, neighbors)
end
+ prediction[:compound] = compound
+ prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort with ascending similarities
- $logger.debug "Prediction time: #{Time.now-time}"
- time = Time.now
- p prediction
# AM: transform to original space (TODO)
confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
- $logger.debug "predicted value: #{prediction[0]}, confidence: #{prediction[1]}"
+ $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}"
+ predictions << prediction
+ pt += Time.now-t
end
- prediction_dataset.compound_ids << compound
- prediction_dataset[c,0] = prediction[0]
- prediction_dataset[c,1] = prediction[1]
end
- prediction_dataset
+ $logger.debug "Transform time: #{tt}"
+ $logger.debug "Prediction time: #{pt}"
+
+ # serialize result
+ case object.class.to_s
+ when "OpenTox::Compound"
+ return predictions.first
+ when "Array"
+ return predictions
+ when "OpenTox::Dataset"
+ # prepare prediction dataset
+ prediction_dataset = LazarPrediction.new(
+ :title => "Lazar prediction for #{prediction_feature.title}",
+ :creator => __FILE__
+ )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+ prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.compounds = compounds
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]}
+ prediction_dataset.save_all
+ return prediction_dataset
+ end
end
def training_activities
- # TODO select predicted variable
- #@training_activities = @training_dataset.data_entries.collect{|entry|
- #act = entry[prediction_feature_pos] if entry
- #@prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act
- #}
- @training_dataset.data_entries.flatten
+ i = @training_dataset.feature_ids.index prediction_feature_id
+ @training_dataset.data_entries.collect{|de| de[i]}
+ end
+
+ end
+
+ class LazarRegression < Lazar
+ field :min_train_performance, type: Float, default: 0.1
+ def initialize
+ super
+ self.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression"
+ self.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine"
+ self.min_sim = 0.7
+
+ # AM: transform to cosine space
+ min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
end
+ end
+ class LazarClassification < Lazar
+ def initialize
+ super
+ self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
+ self.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto"
+ self.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match"
+ self.min_sim = 0.3
+ end
end
end
diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb
index 7743247..1764b47 100644
--- a/lib/opentox-algorithm.rb
+++ b/lib/opentox-algorithm.rb
@@ -15,7 +15,7 @@ require_relative '../last-utils/lu.rb'
#Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f}
require_relative "algorithm.rb"
require_relative "descriptor.rb"
-require_relative "fminer.rb"
+#require_relative "fminer.rb"
require_relative "lazar.rb"
require_relative "transform.rb"
require_relative "similarity.rb"
diff --git a/lib/regression.rb b/lib/regression.rb
new file mode 100644
index 0000000..4bade40
--- /dev/null
+++ b/lib/regression.rb
@@ -0,0 +1,174 @@
+#require "rinruby"
+
+# TODO install R packages kernlab, caret, doMC, class, e1071
+# TODO use Rserve
+
+module OpenTox
+ module Algorithm
+
+ class Regression
+require "rserve"
+
+ # Local support vector regression from neighbors
+ # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
+
+ confidence = 0.0
+ prediction = nil
+
+ $logger.debug "Local SVM."
+ props = neighbors.collect{|row| row[3] }
+ neighbors.shift
+ activities = neighbors.collect{|n| n[2]}
+ prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
+ $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
+ if prediction
+ confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
+ else
+ confidence = nil if prediction.nil?
+ end
+ [prediction, confidence]
+
+ end
+
+
+ # Local support vector prediction from neighbors.
+ # Uses propositionalized setting.
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Array] activities, activities for neighbors.
+ # @param [Float] min_train_performance, parameter to control censoring
+ # @return [Numeric] A prediction value.
+ def self.local_svm_prop(props, activities, min_train_performance)
+
+ $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+ n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
+ q_prop = props[0] # is an Array.
+
+ prediction = nil
+ if activities.uniq.size == 1
+ prediction = activities[0]
+ else
+ t = Time.now
+ #$logger.debug gram_matrix.to_yaml
+ #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
+ @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
+ rs = []
+ ["caret", "doMC", "class"].each do |lib|
+ #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
+ rs << "suppressPackageStartupMessages(library('#{lib}'))"
+ end
+ #@r.eval "registerDoMC()" # switch on parallel processing
+ rs << "registerDoMC()" # switch on parallel processing
+ #@r.eval "set.seed(1)"
+ rs << "set.seed(1)"
+ $logger.debug "Loading R packages: #{Time.now-t}"
+ t = Time.now
+ p n_prop
+ begin
+
+ # set data
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+ rs << "n_prop_x_size <- c(#{n_prop.size})"
+ rs << "n_prop_y_size <- c(#{n_prop[0].size})"
+ rs << "y <- c(#{activities.join(',')})"
+ rs << "q_prop <- c(#{q_prop.join(',')})"
+ rs << "y = matrix(y)"
+ rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+ rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
+
+ $logger.debug "Setting R data: #{Time.now-t}"
+ t = Time.now
+ # prepare data
+ rs << "
+ weights=NULL
+ if (!(class(y) == 'numeric')) {
+ y = factor(y)
+ weights=unlist(as.list(prop.table(table(y))))
+ weights=(weights-1)^2
+ }
+ "
+
+ rs << "
+ rem = nearZeroVar(prop_matrix)
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ rem = findCorrelation(cor(prop_matrix))
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ "
+
+ #p @r.eval("y").to_ruby
+ #p "weights"
+ #p @r.eval("weights").to_ruby
+ $logger.debug "Preparing R data: #{Time.now-t}"
+ t = Time.now
+ # model + support vectors
+ #train_success = @r.eval <<-EOR
+ rs << '
+ model = train(prop_matrix,y,
+ method="svmRadial",
+ preProcess=c("center", "scale"),
+ class.weights=weights,
+ trControl=trainControl(method="LGOCV",number=10),
+ tuneLength=8
+ )
+ perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ '
+ File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
+ p rs.join("\n")
+ p `Rscript /tmp/r.r`
+=begin
+ @r.void_eval <<-EOR
+ model = train(prop_matrix,y,
+ method="svmRadial",
+ #preProcess=c("center", "scale"),
+ #class.weights=weights,
+ #trControl=trainControl(method="LGOCV",number=10),
+ #tuneLength=8
+ )
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ EOR
+=end
+
+ $logger.debug "Creating R SVM model: #{Time.now-t}"
+ t = Time.now
+ if train_success
+ # prediction
+ @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
+ #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
+ prediction = @r.p
+
+ # censoring
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
+ prediction = nil if prediction =~ /NA/
+ $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
+ else
+ $logger.debug "Model creation failed."
+ prediction = nil
+ end
+ $logger.debug "R Prediction: #{Time.now-t}"
+ rescue Exception => e
+ $logger.debug "#{e.class}: #{e.message}"
+ $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ #puts @r.inspect
+ #TODO: broken pipe
+ #@r.quit # free R
+ end
+ end
+ prediction
+ end
+ end
+
+ end
+end
+
diff --git a/lib/transform.rb b/lib/transform.rb
index 15b7b60..b2cca86 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -403,11 +403,11 @@ module OpenTox
# Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
# Same for compound fingerprints.
def get_matrices
- @compounds = @model.training_dataset.compounds.clone
+ @compounds = @model.training_dataset.compounds
# TODO select predicted variable
@activities = @model.training_activities
- @n_prop = @model.feature_dataset.data_entries.clone
- @q_prop = @model.query_fingerprint.flatten.clone
+ @n_prop = @model.feature_dataset.data_entries
+ @q_prop = @model.query_fingerprint.flatten
end
# Returns propositionalized data, if appropriate, or nil