From cdc0e4dd01c4b8da3a43e7d6b49a7e09a9881d63 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 1 Aug 2015 17:53:26 +0200 Subject: 50 times faster bbrc setup by eliminating @fminer.add_fminer_data --- babel_3d_cache/.gitkeep | 0 lib/algorithm.rb | 1 + lib/bbrc.rb | 61 +++++++++----- lib/classification.rb | 10 +-- lib/fminer.rb | 12 +-- lib/lazar.rb | 201 ++++++++++++++++++++--------------------------- lib/opentox-algorithm.rb | 2 +- lib/regression.rb | 174 ++++++++++++++++++++++++++++++++++++++++ lib/transform.rb | 6 +- 9 files changed, 311 insertions(+), 156 deletions(-) delete mode 100644 babel_3d_cache/.gitkeep create mode 100644 lib/regression.rb diff --git a/babel_3d_cache/.gitkeep b/babel_3d_cache/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/lib/algorithm.rb b/lib/algorithm.rb index eda7588..0e227d6 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -3,6 +3,7 @@ module OpenTox module Algorithm def self.run algorithm, object, parameters={} + bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ klass,method = algorithm.split('.') parameters.empty? ? 
Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) end diff --git a/lib/bbrc.rb b/lib/bbrc.rb index 595d712..6b0eb26 100644 --- a/lib/bbrc.rb +++ b/lib/bbrc.rb @@ -1,6 +1,9 @@ module OpenTox module Algorithm class Fminer + TABLE_OF_ELEMENTS = [ +"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] + # # Run bbrc algorithm on dataset # @@ -14,27 +17,40 @@ module OpenTox # - nr_hits Set to "true" to get hit count instead of presence # - get_target Set to "true" to obtain target variable as feature # @return [text/uri-list] Task URI - def self.bbrc dataset, params={} - - table_of_elements = [ -"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] - - 
@fminer=OpenTox::Algorithm::Fminer.new - @fminer.check_params(dataset,params,5) + def self.bbrc training_dataset, params={} time = Time.now + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + prediction_feature = training_dataset.features.first + if params[:min_frequency] + minfreq = params[:min_frequency] + else + per_mil = 5 # value from latest version + i = training_dataset.feature_ids.index prediction_feature.id + nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size + minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = 2 unless minfreq > 2 + minfreq = minfreq.round + end + + #@fminer=OpenTox::Algorithm::Fminer.new + #@fminer.check_params(dataset,params,5) + #p @fminer.instance_variables + @bbrc = Bbrc::Bbrc.new @bbrc.Reset - if @fminer.prediction_feature.numeric + if prediction_feature.numeric @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! 
else bad_request_error "No accept values for "\ - "dataset '#{@fminer.training_dataset.id}' and "\ - "feature '#{@fminer.prediction_feature.id}'" unless @fminer.prediction_feature.accept_values - value_map = @fminer.prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=@fminer.prediction_feature.accept_values[idx]; h } + "dataset '#{training_dataset.id}' and "\ + "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values + act2value = prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=prediction_feature.accept_values[idx]; h } + value2act = act2value.invert end - @bbrc.SetMinfreq(@fminer.minfreq) + @bbrc.SetMinfreq(minfreq) @bbrc.SetType(1) if params[:feature_type] == "paths" @bbrc.SetBackbone(false) if params[:backbone] == "false" @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] @@ -42,21 +58,28 @@ module OpenTox params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false feature_dataset = FminerDataset.new( - :training_dataset_id => dataset.id, + :training_dataset_id => training_dataset.id, :training_algorithm => "#{self.to_s}.bbrc", - :training_feature_id => params[:prediction_feature].id , + :training_feature_id => prediction_feature.id , :training_parameters => { - :min_frequency => @fminer.minfreq, + :min_frequency => minfreq, :nr_hits => nr_hits, :backbone => (params[:backbone] == false ? 
false : true) } ) - feature_dataset.compounds = dataset.compounds + feature_dataset.compounds = training_dataset.compounds + $logger.debug "Setup: #{Time.now-time}" + time = Time.now # Add data to fminer - @fminer.add_fminer_data(@bbrc, value_map) - g_median=@fminer.all_activities.values.to_scale.median + #@fminer.add_fminer_data(@bbrc, value_map) + training_dataset.compounds.each_with_index do |compound,i| + @bbrc.AddCompound(compound.smiles,i+1) + act = value2act[training_dataset.data_entries[i].first] + @bbrc.AddActivity(act,i+1) + end + #g_median=@fminer.all_activities.values.to_scale.median #task.progress 10 #step_width = 80 / @bbrc.GetNoRootNodes().to_f @@ -76,7 +99,7 @@ module OpenTox smarts = f.shift # convert fminer SMARTS representation into a more human readable format smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do - element = table_of_elements[$1.to_i-1] + element = TABLE_OF_ELEMENTS[$1.to_i-1] $2 == "a" ? element.downcase : element end p_value = f.shift diff --git a/lib/classification.rb b/lib/classification.rb index 127fa28..d71ab77 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -8,12 +8,8 @@ module OpenTox # @return [Numeric] A prediction value. def self.weighted_majority_vote(neighbors) - return {:prediction => nil, :confidence => nil} if neighbors.empty? - neighbor_contribution = 0.0 confidence_sum = 0.0 - confidence = 0.0 - prediction = nil $logger.debug "Weighted Majority Vote Classification." @@ -39,14 +35,14 @@ module OpenTox elsif confidence_sum < 0.0 prediction = values[0] end + elsif values.size == 1 # all neighbors have the same value + prediction = values[0] else prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction end - $logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil? confidence = (confidence_sum/neighbors.size).abs - $logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil? 
- [prediction, confidence.abs] + {:value => prediction, :confidence => confidence.abs} end # Local support vector regression from neighbors diff --git a/lib/fminer.rb b/lib/fminer.rb index 666cefa..37be183 100644 --- a/lib/fminer.rb +++ b/lib/fminer.rb @@ -186,16 +186,8 @@ module OpenTox # @param [Integer] per-mil value # return [Integer] min-frequency def min_frequency(training_dataset,prediction_feature,per_mil) - nr_labeled_cmpds = DataEntry.where(dataset_id: training_dataset.id, feature_id: prediction_feature.id).in(compound_id: training_dataset.compound_ids).count - #nr_labeled_cmpds=0 - #f_idx=training_dataset.features.index prediction_feature - #training_dataset.compounds.each_with_index { |cmpd, c_idx| - #if ( training_dataset.data_entries[c_idx] ) - #unless training_dataset.data_entries[c_idx][f_idx].nil? - #nr_labeled_cmpds += 1 - #end - #end - #} + i = training_dataset.feature_ids.index prediction_feature.id + nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 
8-10 per mil for BBRC, 50 per mil for LAST minfreq = 2 unless minfreq > 2 Integer (minfreq) diff --git a/lib/lazar.rb b/lib/lazar.rb index 399f5c1..d9195ad 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -1,10 +1,3 @@ -=begin -* Name: lazar.rb -* Description: Lazar model representation -* Author: Andreas Maunz , Christoph Helma -* Date: 10/2012 -=end - module OpenTox module Model @@ -16,26 +9,18 @@ module OpenTox store_in collection: "models" field :title, type: String - field :description, type: String - #field :parameters, type: Array, default: [] + field :endpoint, type: String field :creator, type: String, default: __FILE__ # datasets field :training_dataset_id, type: BSON::ObjectId field :feature_dataset_id, type: BSON::ObjectId # algorithms - #field :feature_generation, type: String - #field :feature_calculation_algorithm, type: String + field :feature_calculation_algorithm, type: String field :prediction_algorithm, type: String field :similarity_algorithm, type: String - # prediction features - field :prediction_feature_id, type: BSON::ObjectId - field :predicted_value_id, type: BSON::ObjectId - field :predicted_variables, type: Array - # parameters - field :nr_hits, type: Boolean field :min_sim, type: Float - #field :propositionalized, type:Boolean - field :min_train_performance, type: Float + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId attr_accessor :prediction_dataset attr_accessor :training_dataset @@ -43,84 +28,31 @@ module OpenTox attr_accessor :query_fingerprint attr_accessor :neighbors - # Check parameters for plausibility - # Prepare lazar object (includes graph mining) - # @param[Array] lazar parameters as strings - # @param[Hash] REST parameters, as input by user - def self.create training_dataset, feature_dataset, prediction_feature=nil, nr_hits=false, params={} - - lazar = OpenTox::Model::Lazar.new + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + 
# @param [OpenTox::Dataset] feature_dataset + # @return [OpenTox::Model::Lazar] Regression or classification model + def self.create training_dataset, feature_dataset bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty? - lazar.feature_dataset_id = feature_dataset.id - @training_dataset = training_dataset - bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless @training_dataset.compounds == feature_dataset.compounds - lazar.training_dataset_id = @training_dataset.id - - if prediction_feature - resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{@training_dataset.id}'" unless @training_dataset.features.include?( params[:prediction_feature] ) - else # try to read prediction_feature from dataset - resource_not_found_error "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1 - prediction_feature = @training_dataset.features.first - end + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds + prediction_feature = training_dataset.features.first + prediction_feature.nominal ? 
lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new + lazar.feature_dataset_id = feature_dataset.id + lazar.training_dataset_id = training_dataset.id lazar.prediction_feature_id = prediction_feature.id lazar.title = prediction_feature.title - if params and params[:prediction_algorithm] - bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" unless OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm]) - lazar.prediction_algorithm = params[:prediction_algorithm] - end - - unless lazar.prediction_algorithm # set defaults - # TODO consider params - if prediction_feature.nominal - lazar.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" - lazar.min_sim = 0.3 unless lazar.min_sim - elsif prediction_feature.numeric - lazar.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" - # cosine similartiy is default - lazar.min_sim = 0.7 unless lazar.min_sim - end - end - #lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true - - lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric? - # TODO: get info from training_dataset - lazar.nr_hits = nr_hits - #lazar.feature_generation = feature_dataset.training_algorithm - #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]} - - bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric? - lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric? 
- lazar.min_train_performance = 0.1 unless lazar.min_train_performance - lazar.save lazar end def predict object - # tailored for performance - # all consistency checks should be done during model creation - time = Time.now - # prepare prediction dataset - prediction_dataset = LazarPrediction.new - prediction_feature = OpenTox::Feature.find prediction_feature_id - prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}", - prediction_dataset.creator = __FILE__, - - confidence_feature = OpenTox::Feature.find_or_create_by({ - "title" => "Prediction confidence", - "numeric" => true - }) - - prediction_dataset.features = [ confidence_feature, prediction_feature ] - @training_dataset = OpenTox::Dataset.find(training_dataset_id) @feature_dataset = OpenTox::Dataset.find(feature_dataset_id) @@ -139,52 +71,44 @@ module OpenTox $logger.debug "Setup: #{Time.now-time}" time = Time.now - @query_fingerprint = Algorithm.run(feature_dataset.feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) - - $logger.debug "Fingerprint calculation: #{Time.now-time}" - time = Time.now + @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) - # AM: transform to cosine space - min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ + $logger.debug "Query fingerprint calculation: #{Time.now-time}" + predictions = [] + prediction_feature = OpenTox::Feature.find prediction_feature_id + tt = 0 + pt = 0 compounds.each_with_index do |compound,c| + t = Time.new $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" database_activities = @training_dataset.values(compound,prediction_feature) if database_activities and !database_activities.empty? 
- database_activities.each do |database_activity| - $logger.debug "do not predict compound, it occurs in dataset with activity #{database_activity}" - prediction_dataset.compound_ids << compound.id - prediction_dataset[c,0] = database_activity - prediction_dataset[c,1] = nil - end + database_activities = database_activities.first if database_activities.size == 1 + $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}" + predictions << {:compound => compound, :value => database_activities, :confidence => "measured"} next else - t = Time.new if prediction_algorithm =~ /Regression/ mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) mtf.transform training_fingerprints = mtf.n_prop - training_activities = mtf.activities - p training_activities query_fingerprint = mtf.q_prop neighbors = [[nil,nil,nil,query_fingerprint]] else training_fingerprints = @feature_dataset.data_entries - # TODO fix for multi feature datasets - training_activities = @training_dataset.data_entries[i].first query_fingerprint = @query_fingerprint[c] neighbors = [] end - $logger.debug "Transform: #{Time.now-t}" + tt += Time.now-t t = Time.new # find neighbors training_fingerprints.each_with_index do |fingerprint, i| - sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint) if sim > self.min_sim if prediction_algorithm =~ /Regression/ @@ -195,40 +119,85 @@ module OpenTox end end + if neighbors.empty? + predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"} + #$logger.warn "No neighbors found for compound #{compound}." 
+ next + end + if prediction_algorithm =~ /Regression/ prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance) else prediction = Algorithm.run(prediction_algorithm, neighbors) end + prediction[:compound] = compound + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort by descending similarity - $logger.debug "Prediction time: #{Time.now-time}" - time = Time.now - p prediction # AM: transform to original space (TODO) confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/ - $logger.debug "predicted value: #{prediction[0]}, confidence: #{prediction[1]}" + $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}" + predictions << prediction + pt += Time.now-t end - prediction_dataset.compound_ids << compound - prediction_dataset[c,0] = prediction[0] - prediction_dataset[c,1] = prediction[1] end - prediction_dataset + $logger.debug "Transform time: #{tt}" + $logger.debug "Prediction time: #{pt}" + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__ + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end end def training_activities - # TODO select predicted variable - #@training_activities = 
@training_dataset.data_entries.collect{|entry| - #act = entry[prediction_feature_pos] if entry - #@prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act - #} - @training_dataset.data_entries.flatten + i = @training_dataset.feature_ids.index prediction_feature_id + @training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarRegression < Lazar + field :min_train_performance, type: Float, default: 0.1 + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" + self.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" + self.min_sim = 0.7 + + # AM: transform to cosine space + min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ end + end + class LazarClassification < Lazar + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + self.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" + self.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match" + self.min_sim = 0.3 + end end end diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb index 7743247..1764b47 100644 --- a/lib/opentox-algorithm.rb +++ b/lib/opentox-algorithm.rb @@ -15,7 +15,7 @@ require_relative '../last-utils/lu.rb' #Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f} require_relative "algorithm.rb" require_relative "descriptor.rb" -require_relative "fminer.rb" +#require_relative "fminer.rb" require_relative "lazar.rb" require_relative "transform.rb" require_relative "similarity.rb" diff --git a/lib/regression.rb b/lib/regression.rb new file mode 100644 index 0000000..4bade40 --- /dev/null +++ b/lib/regression.rb @@ -0,0 +1,174 @@ +#require "rinruby" + +# TODO install R packages kernlab, caret, doMC, class, e1071 +# TODO use Rserve + +module OpenTox + module Algorithm + + class Regression +require "rserve" + + # Local support vector 
regression from neighbors + # @param [Hash] params Only `:min_train_performance` is read (default 0.1) + # @return [Array] `[prediction, confidence]`; confidence is nil when prediction is nil. + def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." + props = neighbors.collect{|row| row[3] } + neighbors.shift + activities = neighbors.collect{|n| n[2]} + prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = nil if (!prediction.nil? && prediction.infinite?) + $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." + if prediction + confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) + else + confidence = nil if prediction.nil? + end + [prediction, confidence] + + end + + + # Local support vector prediction from neighbors. + # Uses propositionalized setting. + # Not to be called directly (use local_svm_regression or local_svm_classification). + # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Array] activities, activities for neighbors. + # @param [Float] min_train_performance, parameter to control censoring + # @return [Numeric] A prediction value. + def self.local_svm_prop(props, activities, min_train_performance) + + $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." + n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. + q_prop = props[0] # is an Array. 
+ + prediction = nil + if activities.uniq.size == 1 + prediction = activities[0] + else + t = Time.now + #$logger.debug gram_matrix.to_yaml + #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests + @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests + rs = [] + ["caret", "doMC", "class"].each do |lib| + #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" + rs << "suppressPackageStartupMessages(library('#{lib}'))" + end + #@r.eval "registerDoMC()" # switch on parallel processing + rs << "registerDoMC()" # switch on parallel processing + #@r.eval "set.seed(1)" + rs << "set.seed(1)" + $logger.debug "Loading R packages: #{Time.now-t}" + t = Time.now + p n_prop + begin + + # set data + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop_x_size <- c(#{n_prop.size})" + rs << "n_prop_y_size <- c(#{n_prop[0].size})" + rs << "y <- c(#{activities.join(',')})" + rs << "q_prop <- c(#{q_prop.join(',')})" + rs << "y = matrix(y)" + rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" + rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" + + $logger.debug "Setting R data: #{Time.now-t}" + t = Time.now + # prepare data + rs << " + weights=NULL + if (!(class(y) == 'numeric')) { + y = factor(y) + weights=unlist(as.list(prop.table(table(y)))) + weights=(weights-1)^2 + } + " + + rs << " + rem = nearZeroVar(prop_matrix) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + rem = findCorrelation(cor(prop_matrix)) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + " + + #p @r.eval("y").to_ruby + #p "weights" + #p @r.eval("weights").to_ruby + $logger.debug "Preparing R data: #{Time.now-t}" + t = Time.now + # model + 
support vectors + #train_success = @r.eval <<-EOR + rs << ' + model = train(prop_matrix,y, + method="svmRadial", + preProcess=c("center", "scale"), + class.weights=weights, + trControl=trainControl(method="LGOCV",number=10), + tuneLength=8 + ) + perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) + ' + File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} + p rs.join("\n") + p `Rscript /tmp/r.r` +=begin + @r.void_eval <<-EOR + model = train(prop_matrix,y, + method="svmRadial", + #preProcess=c("center", "scale"), + #class.weights=weights, + #trControl=trainControl(method="LGOCV",number=10), + #tuneLength=8 + ) + perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) + EOR +=end + + $logger.debug "Creating R SVM model: #{Time.now-t}" + t = Time.now + if train_success + # prediction + @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice + #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice + @r.eval "if (class(y)!='numeric') p = as.character(p)" + prediction = @r.p + + # censoring + prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) + prediction = nil if prediction =~ /NA/ + $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" + else + $logger.debug "Model creation failed." + prediction = nil + end + $logger.debug "R Prediction: #{Time.now-t}" + rescue Exception => e + $logger.debug "#{e.class}: #{e.message}" + $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + #puts @r.inspect + #TODO: broken pipe + #@r.quit # free R + end + end + prediction + end + end + + end +end + diff --git a/lib/transform.rb b/lib/transform.rb index 15b7b60..b2cca86 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -403,11 +403,11 @@ module OpenTox # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed. 
# Same for compound fingerprints. def get_matrices - @compounds = @model.training_dataset.compounds.clone + @compounds = @model.training_dataset.compounds # TODO select predicted variable @activities = @model.training_activities - @n_prop = @model.feature_dataset.data_entries.clone - @q_prop = @model.query_fingerprint.flatten.clone + @n_prop = @model.feature_dataset.data_entries + @q_prop = @model.query_fingerprint.flatten end # Returns propositionalized data, if appropriate, or nil -- cgit v1.2.3