From 28c41fc27bea4668ee1dc3c8d1f086e64d271b5a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 31 Jul 2015 19:24:45 +0200 Subject: intermediary commit --- algorithm.gemspec | 3 +- lib/classification.rb | 37 +++++++++++++-- lib/descriptor.rb | 10 +++- lib/lazar.rb | 117 +++++++++++++++++++++++++++-------------------- lib/opentox-algorithm.rb | 3 +- lib/transform.rb | 7 ++- 6 files changed, 118 insertions(+), 59 deletions(-) diff --git a/algorithm.gemspec b/algorithm.gemspec index c3119e6..1a94225 100644 --- a/algorithm.gemspec +++ b/algorithm.gemspec @@ -20,7 +20,8 @@ Gem::Specification.new do |s| # specify any dependencies here; for example: #s.add_runtime_dependency "opentox-server" s.add_runtime_dependency "opentox-client" - s.add_runtime_dependency 'rinruby'#, "~>2.0.2" + s.add_runtime_dependency 'rserve-client'#, "~>2.0.2" + #s.add_runtime_dependency 'rinruby'#, "~>2.0.2" s.add_runtime_dependency 'nokogiri'#, "~>1.4.4" s.add_runtime_dependency 'statsample'#, "~>1.1" s.add_runtime_dependency 'gsl'#, "~>1.14" diff --git a/lib/classification.rb b/lib/classification.rb index f6c9b11..127fa28 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -17,10 +17,10 @@ module OpenTox $logger.debug "Weighted Majority Vote Classification." - values = neighbors.collect{|n| n[1]}.uniq + values = neighbors.collect{|n| n[2]}.uniq neighbors.each do |neighbor| - neighbor_weight = neighbor[2] - activity = values.index(neighbor[1]) + 1 # map values to integers > 1 + neighbor_weight = neighbor[1] + activity = values.index(neighbor[2]) + 1 # map values to integers > 1 neighbor_contribution += activity * neighbor_weight if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true case activity @@ -46,9 +46,38 @@ module OpenTox $logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil? confidence = (confidence_sum/neighbors.size).abs $logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil? 
- return {:prediction => prediction, :confidence => confidence.abs} + [prediction, confidence.abs] end + # Local support vector classification from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Hash] Keys `:prediction` and `:confidence`. + def self.local_svm_classification(params) + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." + if params[:activities].size>0 + if params[:props] + n_prop = params[:props][0].collect.to_a + q_prop = params[:props][1].collect.to_a + props = [ n_prop, q_prop ] + end + activities = params[:activities].collect.to_a + activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = prediction.sub(/Val/,"") if prediction # Convert back + confidence = 0.0 if prediction.nil? + #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." 
+ confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) + end + {:prediction => prediction, :confidence => confidence} + + end + + + end end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 1d43d7d..8ec7480 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -1,7 +1,6 @@ require 'digest/md5' ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" BABEL_3D_CACHE_DIR = File.join(File.dirname(__FILE__),"..",'/babel_3d_cache') -# TODO store 3D structures in mongodb # TODO store descriptors in mongodb module OpenTox @@ -59,6 +58,7 @@ module OpenTox bad_request_error "Compounds for smarts_match are empty" unless compounds bad_request_error "Smarts for smarts_match are empty" unless smarts parse compounds + @count = count obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new obconversion.set_in_format('inchi') @@ -100,13 +100,19 @@ module OpenTox @data_entries end when "OpenTox::Dataset" - dataset = OpenTox::Dataset.new(:compound_ids => @compounds.collect{|c| c.id}) + dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) if @smarts dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + @count ? algo = "count" : algo = "match" + dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" + elsif @physchem_descriptors dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} dataset.data_entries = @data_entries + dataset.feature_calculation_algorithm = "#{self}.physchem" + #TODO params? 
end + dataset.save_all dataset end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 19f8cdd..399f5c1 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -23,8 +23,8 @@ module OpenTox field :training_dataset_id, type: BSON::ObjectId field :feature_dataset_id, type: BSON::ObjectId # algorithms - field :feature_generation, type: String - field :feature_calculation_algorithm, type: String + #field :feature_generation, type: String + #field :feature_calculation_algorithm, type: String field :prediction_algorithm, type: String field :similarity_algorithm, type: String # prediction features @@ -34,7 +34,7 @@ module OpenTox # parameters field :nr_hits, type: Boolean field :min_sim, type: Float - field :propositionalized, type:Boolean + #field :propositionalized, type:Boolean field :min_train_performance, type: Float attr_accessor :prediction_dataset @@ -54,7 +54,6 @@ module OpenTox bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty? lazar.feature_dataset_id = feature_dataset.id @training_dataset = training_dataset - #@training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"]) bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." 
unless @training_dataset.compounds == feature_dataset.compounds lazar.training_dataset_id = @training_dataset.id @@ -73,31 +72,26 @@ module OpenTox lazar.prediction_algorithm = params[:prediction_algorithm] end - unless lazar.prediction_algorithm - lazar.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" if prediction_feature.nominal - lazar.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" if prediction_feature.numeric + unless lazar.prediction_algorithm # set defaults + # TODO consider params + if prediction_feature.nominal + lazar.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" + lazar.min_sim = 0.3 unless lazar.min_sim + elsif prediction_feature.numeric + lazar.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" + lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" + # cosine similarity is default + lazar.min_sim = 0.7 unless lazar.min_sim + end end - lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true + #lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric? 
+ # TODO: get info from training_dataset lazar.nr_hits = nr_hits - lazar.feature_generation = feature_dataset.training_algorithm + #lazar.feature_generation = feature_dataset.training_algorithm #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]} - if lazar.feature_generation =~ /fminer|bbrc|last/ - if lazar[:nr_hits] - lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_count" - else - lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match" - end - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" - lazar.min_sim = 0.3 unless lazar.min_sim - elsif lazar.feature_generation =~/descriptor/ or lazar.feature_generation.nil? - # cosine similartiy is default (e.g. used when no fetature_generation_uri is given and a feature_dataset_uri is provided instead) - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" - lazar.min_sim = 0.7 unless lazar.min_sim - else - bad_request_error "unkown feature generation method #{lazar.feature_generation}" - end bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric? lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric? 
@@ -107,7 +101,7 @@ module OpenTox lazar end - def predict params + def predict object # tailored for performance # all consistency checks should be done during model creation @@ -131,20 +125,21 @@ module OpenTox @feature_dataset = OpenTox::Dataset.find(feature_dataset_id) compounds = [] - if params[:compound] - compounds = [ params[:compound]] - elsif params[:compounds] - compounds = params[:compounds] - elsif params[:dataset] - compounds = params[:dataset].compounds + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds else - bad_request_error "Please provide one of the parameters: :compound, :compounds, :dataset" + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." end $logger.debug "Setup: #{Time.now-time}" time = Time.now - @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.smarts} ) + @query_fingerprint = Algorithm.run(feature_dataset.feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) $logger.debug "Fingerprint calculation: #{Time.now-time}" time = Time.now @@ -166,35 +161,59 @@ module OpenTox end next else - - # TODO reintroduce for regression - #mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) - #mtf.transform - # + t = Time.new + + if prediction_algorithm =~ /Regression/ + mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) + mtf.transform + training_fingerprints = mtf.n_prop + training_activities = mtf.activities + p training_activities + query_fingerprint = mtf.q_prop + neighbors = [[nil,nil,nil,query_fingerprint]] + else + training_fingerprints = @feature_dataset.data_entries + # TODO fix for multi feature datasets + training_activities = @training_dataset.data_entries[i].first + query_fingerprint = @query_fingerprint[c] + neighbors = [] + 
end + $logger.debug "Transform: #{Time.now-t}" + t = Time.new + # find neighbors - neighbors = [] - @feature_dataset.data_entries.each_with_index do |fingerprint, i| - - sim = Algorithm.run(similarity_algorithm,fingerprint, @query_fingerprint[c]) - # TODO fix for multi feature datasets - neighbors << [@feature_dataset.compounds[i],@training_dataset.data_entries[i].first,sim] if sim > self.min_sim + training_fingerprints.each_with_index do |fingerprint, i| + + sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint) + if sim > self.min_sim + if prediction_algorithm =~ /Regression/ + neighbors << [@feature_dataset.compounds[i],sim,training_activities[i], fingerprint] + else + neighbors << [@feature_dataset.compounds[i],sim,training_activities[i]] + end + end end - prediction = Algorithm.run(prediction_algorithm, neighbors) + if prediction_algorithm =~ /Regression/ + prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance) + else + prediction = Algorithm.run(prediction_algorithm, neighbors) + end $logger.debug "Prediction time: #{Time.now-time}" time = Time.now + p prediction # AM: transform to original space (TODO) - confidence_value = ((confidence_value+1.0)/2.0).abs if similarity_algorithm =~ /cosine/ + confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/ - $logger.debug "predicted value: #{prediction[:prediction]}, confidence: #{prediction[:confidence]}" + $logger.debug "predicted value: #{prediction[0]}, confidence: #{prediction[1]}" end prediction_dataset.compound_ids << compound - prediction_dataset[c,0] = prediction[:prediction] - prediction_dataset[c,1] = prediction[:confidence] + prediction_dataset[c,0] = prediction[0] + prediction_dataset[c,1] = prediction[1] end prediction_dataset diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb index d768cfd..7743247 100644 --- a/lib/opentox-algorithm.rb +++ 
b/lib/opentox-algorithm.rb @@ -19,5 +19,6 @@ require_relative "fminer.rb" require_relative "lazar.rb" require_relative "transform.rb" require_relative "similarity.rb" -require_relative "neighbors.rb" +#require_relative "neighbors.rb" require_relative "classification.rb" +require_relative "regression.rb" diff --git a/lib/transform.rb b/lib/transform.rb index fad9517..15b7b60 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -231,7 +231,7 @@ module OpenTox # Attaches transformations to an OpenTox::Model # Stores props, sims, performs similarity calculations class ModelTransformer - attr_accessor :model, :similarity_algorithm, :activities, :sims + attr_accessor :model, :similarity_algorithm, :activities, :sims, :n_prop, :q_prop # @params[OpenTox::Model] model Model to transform def initialize model @@ -282,6 +282,7 @@ module OpenTox @ids = [] # surviving compounds become neighbors @sims = [] # calculated by neighbor routine +=begin neighbors n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp @@ -315,6 +316,7 @@ module OpenTox $logger.debug "Sims: #{@sims.size}, Acts: #{@activities.size}" @sims = [ gram_matrix, @sims ] +=end end @@ -393,7 +395,8 @@ module OpenTox # @param[Array] A propositionalized data entry # @return[Float] Similarity to query structure def similarity(training_props) - OpenTox::Algorithm::Similarity.send(@model.similarity_algorithm,training_props, @q_prop) + eval("#{@model.similarity_algorithm}(#{training_props}, #{@q_prop})") + #OpenTox::Algorithm::Similarity.send(@model.similarity_algorithm,training_props, @q_prop) end -- cgit v1.2.3