diff options
-rw-r--r-- | lib/algorithm.rb | 148 | ||||
-rw-r--r-- | lib/feature.rb | 1 | ||||
-rw-r--r-- | lib/model.rb | 128 | ||||
-rw-r--r-- | lib/to-html.rb | 2 | ||||
-rw-r--r-- | lib/validation.rb | 53 |
5 files changed, 259 insertions, 73 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 7fbe0dc..5b41cbf 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -164,11 +164,7 @@ module OpenTox # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required # @return [Hash] Hash with keys `:prediction, :confidence` - def self.local_svm_regression(neighbors,params ) - sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values between query and neighbors - conf = sims.inject{|sum,x| sum + x } - - # AM: Control log taking + def self.local_svm_regression(neighbors, params) take_logs=true neighbors.each do |n| if (! n[:activity].nil?) && (n[:activity].to_f < 0.0) @@ -180,57 +176,115 @@ module OpenTox take_logs ? Math.log10(act.to_f) : act.to_f end # activities of neighbors for supervised learning - neighbor_matches = neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches - gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel - if neighbor_matches.size == 0 - raise "No neighbors found" - else - # gram matrix - (0..(neighbor_matches.length-1)).each do |i| - gram_matrix[i] = [] unless gram_matrix[i] - # upper triangle - ((i+1)..(neighbor_matches.length-1)).each do |j| - sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])") - gram_matrix[i][j] = Algorithm.gauss(sim) - gram_matrix[j] = [] unless gram_matrix[j] - gram_matrix[j][i] = gram_matrix[i][j] # lower triangle - end - gram_matrix[i][i] = 1.0 - end + sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors + begin + prediction = local_svm(neighbors, acts, sims, "nu-svr", params) + prediction = (take_logs ? 10**(prediction.to_f) : prediction.to_f) + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + end - #LOGGER.debug gram_matrix.to_yaml - @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed - LOGGER.debug "Setting R data ..." - # set data - @r.gram_matrix = gram_matrix.flatten - @r.n = neighbor_matches.size - @r.y = acts - @r.sims = sims + conf = sims.inject{|sum,x| sum + x } + confidence = conf/neighbors.size if neighbors.size > 0 + {:prediction => prediction, :confidence => confidence} + + end - LOGGER.debug "Preparing R data ..." - # prepare data - @r.eval "y<-as.vector(y)" - @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))" - @r.eval "sims<-as.vector(sims)" - - # model + support vectors - LOGGER.debug "Creating SVM model ..." - @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)" - @r.eval "sv<-as.vector(SVindex(model))" - @r.eval "sims<-sims[sv]" - @r.eval "sims<-as.kernelMatrix(matrix(sims,1))" - LOGGER.debug "Predicting ..." - @r.eval "p<-predict(model,sims)[1,1]" - prediction = 10**(@r.p.to_f) if take_logs + # Local support vector classification from neighbors + # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` + # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required + # @return [Hash] Hash with keys `:prediction, :confidence` + def self.local_svm_classification(neighbors, params) + acts = neighbors.collect do |n| + act = n[:activity] + end # activities of neighbors for supervised learning + acts_f = acts.collect {|v| v == true ? 1.0 : 0.0} + sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors + begin + prediction = local_svm (neighbors, acts_f, sims, "C-bsvc", params) LOGGER.debug "Prediction is: '" + prediction.to_s + "'." - @r.quit # free R + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" end + + conf = sims.inject{|sum,x| sum + x } confidence = conf/neighbors.size if neighbors.size > 0 {:prediction => prediction, :confidence => confidence} end + + # Local support vector prediction from neighbors. + # Not to be called directly (use local_svm_regression or local_svm_classification. + # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features` + # @param [Array] acts, activities for neighbors. + # @param [Array] sims, similarities for neighbors. + # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification). + # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required + # @return [Numeric] A prediction value. + def self.local_svm(neighbors, acts, sims, type, params) + neighbor_matches = neighbors.collect{ |n| n[:features] } # URIs of matches + gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel + if neighbor_matches.size == 0 + raise "No neighbors found." + else + # gram matrix + (0..(neighbor_matches.length-1)).each do |i| + gram_matrix[i] = [] unless gram_matrix[i] + # upper triangle + ((i+1)..(neighbor_matches.length-1)).each do |j| + sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])") + gram_matrix[i][j] = Algorithm.gauss(sim) + gram_matrix[j] = [] unless gram_matrix[j] + gram_matrix[j][i] = gram_matrix[i][j] # lower triangle + end + gram_matrix[i][i] = 1.0 + end + + #LOGGER.debug gram_matrix.to_yaml + @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests + @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed + LOGGER.debug "Setting R data ..." + # set data + @r.gram_matrix = gram_matrix.flatten + @r.n = neighbor_matches.size + @r.y = acts + @r.sims = sims + + begin + LOGGER.debug "Preparing R data ..." + # prepare data + @r.eval "y<-as.vector(y)" + @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))" + @r.eval "sims<-as.vector(sims)" + + # model + support vectors + LOGGER.debug "Creating SVM model ..." + @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)" + @r.eval "sv<-as.vector(SVindex(model))" + @r.eval "sims<-sims[sv]" + @r.eval "sims<-as.kernelMatrix(matrix(sims,1))" + LOGGER.debug "Predicting ..." + if type == "nu-svr" + @r.eval "p<-predict(model,sims)[1,1]" + elsif type == "C-bsvc" + @r.eval "p<-predict(model,sims)" + end + if type == "nu-svr" + prediction = @r.p + elsif type == "C-bsvc" + prediction = (@r.p.to_f == 1.0 ? true : false) + end + @r.quit # free R + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message} #{e.backtrace}" + end + + end + prediction + end + end module Substructure diff --git a/lib/feature.rb b/lib/feature.rb index b631e46..2f1ab6c 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -16,6 +16,7 @@ module OpenTox feature end + # provides feature type, possible types are "regression" or "classification" # @return [String] feature type, unknown if OT.isA property is unknown/ not set def feature_type diff --git a/lib/model.rb b/lib/model.rb index 048de85..998d2dc 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,7 +23,7 @@ module OpenTox # Generic OpenTox model class for all API compliant services class Generic include Model - + # Find Generic Opentox Model via URI, and loads metadata, could raise NotFound/NotAuthorized error # @param [String] uri Model URI # @return [OpenTox::Model::Generic] Model instance @@ -34,12 +34,12 @@ module OpenTox raise "could not load model metadata '"+uri.to_s+"'" if model.metadata==nil or model.metadata.size==0 model end - - # provides feature type, possible types are "regression" or "classification" - # @return [String] feature type, "unknown" if type could not be estimated + + # provides feature type, possible types are "regression" or "classification" + # @return [String] feature type, "unknown" if type could not be estimated def feature_type(subjectid=nil) return @feature_type if @feature_type - + # dynamically perform restcalls if necessary load_metadata(subjectid) if @metadata==nil or @metadata.size==0 or (@metadata.size==1 && @metadata.values[0]==@uri) algorithm = OpenTox::Algorithm::Generic.find(@metadata[OT.algorithm], subjectid) @@ -60,9 +60,9 @@ module OpenTox raise "unknown model "+type_indicators.inspect unless @feature_type @feature_type end - + end - + # Lazy Structure Activity Relationship class class Lazar @@ -78,7 +78,7 @@ module OpenTox else super CONFIG[:services]["opentox-model"] end - + @metadata[OT.algorithm] = File.join(CONFIG[:services]["opentox-algorithm"],"lazar") @features = [] @@ -178,9 +178,65 @@ module OpenTox return @prediction_dataset if database_activity(subjectid) - neighbors - prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values})") + if metadata[RDF.type] == [OTA.ClassificationLazySingleTarget] + # AM: Balancing, see http://www.maunz.de/wordpress/opentox/2011/balanced-lazar + l = Array.new # larger + s = Array.new # smaller fraction + @fingerprints.each do |training_compound,training_features| + @activities[training_compound].each do |act| + case act.to_s + when "false" + l << training_compound + when "true" + s << training_compound + else + LOGGER.warn "BLAZAR: Activity #{act.to_s} should not be reached." + end + end + end + if s.size > l.size then + l,s = s,l # happy swapping + LOGGER.info "BLAZAR: |s|=#{s.size}, |l|=#{l.size}." + end + # determine ratio + modulo = l.size.divmod(s.size)# modulo[0]=ratio, modulo[1]=rest + LOGGER.info "BLAZAR: Balance: #{modulo[0]}, rest #{modulo[1]}." + + # AM: Balanced predictions + addon = (modulo[1].to_f/modulo[0]).ceil # what will be added in each round + slack = modulo[1].divmod(addon)[1] # what remains for the last round + position = 0 + predictions = Array.new + + prediction_best=nil + neighbors_best=nil + + begin + for i in 1..modulo[0] do + (i == modulo[0]) && (slack>0) ? lr_size = s.size + slack : lr_size = s.size + addon # determine fraction + LOGGER.info "BLAZAR: Neighbors round #{i}: #{position} + #{lr_size}." + neighbors_balanced(s, l, position, lr_size) # get ratio fraction of larger part + prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values})") + if prediction_best.nil? || prediction[:confidence].abs > prediction_best[:confidence].abs + prediction_best=prediction + neighbors_best=@neighbors + end + position = position + lr_size + end + rescue Exception => e + LOGGER.error "BLAZAR failed in prediction: "+e.class.to_s+": "+e.message + end + + prediction=prediction_best + @neighbors=neighbors_best + ### END AM balanced predictions + + else # regression case: no balancing + neighbors + prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values})") + end + prediction_feature_uri = File.join( @prediction_dataset.uri, "feature", "prediction", File.basename(@metadata[OT.dependentVariables]),@prediction_dataset.compounds.size.to_s) # TODO: fix dependentVariable @prediction_dataset.metadata[OT.dependentVariables] = prediction_feature_uri @@ -269,27 +325,55 @@ module OpenTox end # Find neighbors and store them as object variable - def neighbors - + def neighbors_balanced(s, l, start, offset) @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm @neighbors = [] - @fingerprints.each do |training_compound,training_features| - sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)") - if sim > @min_sim - @activities[training_compound].each do |act| - @neighbors << { - :compound => training_compound, - :similarity => sim, - :features => training_features, - :activity => act - } + begin + #@fingerprints.each do |training_compound,training_features| # AM: this is original by CH + [ l[start, offset ] , s ].flatten.each do |training_compound| # AM: access only a balanced subset + training_features = @fingerprints[training_compound] + sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)") + if sim > @min_sim + @activities[training_compound].each do |act| + this_neighbor = { + :compound => training_compound, + :similarity => sim, + :features => training_features, + :activity => act + } + @neighbors << this_neighbor + end end end + rescue Exception => e + LOGGER.error "BLAZAR failed in neighbors: "+e.class.to_s+": "+e.message end end + + # Find neighbors and store them as object variable + def neighbors + + @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm + + @neighbors = [] + @fingerprints.each do |training_compound,training_features| + sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)") + if sim > @min_sim + @activities[training_compound].each do |act| + @neighbors << { + :compound => training_compound, + :similarity => sim, + :features => training_features, + :activity => act + } + end + end + end + end + # Find database activities and store them in @prediction_dataset # @return [Boolean] true if compound has databasse activities, false if not def database_activity(subjectid) diff --git a/lib/to-html.rb b/lib/to-html.rb index 6785974..66a3e74 100644 --- a/lib/to-html.rb +++ b/lib/to-html.rb @@ -1,5 +1,5 @@ -OT_LOGO = "http://opentox.informatik.uni-freiburg.de/ot-logo.png" +OT_LOGO = File.join(CONFIG[:services]["opentox-validation"],"resources/ot-logo.png") class String diff --git a/lib/validation.rb b/lib/validation.rb index d58d36e..d7a337c 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -65,7 +65,7 @@ module OpenTox def summary if @metadata[OT.classificationStatistics] res = { - :nr_predictions => @metadata[OT.numInstances] - @metadata[OT.numUnpredicted], + :nr_predictions => @metadata[OT.numInstances].to_i - @metadata[OT.numUnpredicted].to_i, :correct_predictions => @metadata[OT.classificationStatistics][OT.percentCorrect], :weighted_area_under_roc => @metadata[OT.classificationStatistics][OT.weightedAreaUnderRoc], } @@ -83,7 +83,7 @@ module OpenTox res elsif @metadata[OT.regressionStatistics] { - :nr_predictions => @metadata[OT.numInstances] - @metadata[OT.numUnpredicted], + :nr_predictions => @metadata[OT.numInstances].to_i - @metadata[OT.numUnpredicted].to_i, :r_square => @metadata[OT.regressionStatistics][OT.rSquare], :root_mean_squared_error => @metadata[OT.regressionStatistics][OT.rootMeanSquaredError], :mean_absolute_error => @metadata[OT.regressionStatistics][OT.meanAbsoluteError], @@ -198,7 +198,6 @@ module OpenTox # @param [String,optional] subjectid # @return [OpenTox::CrossvalidationReport] def self.find( uri, subjectid=nil ) - # PENDING load report data? OpenTox::RestClientWrapper.get(uri,{:subjectid => subjectid}) rep = CrossvalidationReport.new(uri) rep.load_metadata( subjectid ) @@ -227,6 +226,54 @@ module OpenTox end end + + class AlgorithmComparisonReport + include OpenTox + + # finds AlgorithmComparisonReport via uri, raises error if not found + # @param [String] uri + # @param [String,optional] subjectid + # @return [OpenTox::CrossvalidationReport] + def self.find( uri, subjectid=nil ) + OpenTox::RestClientWrapper.get(uri,{:subjectid => subjectid}) + rep = AlgorithmComparisonReport.new(uri) + rep.load_metadata( subjectid ) + rep + end + + # finds AlgorithmComparisonReport for a particular crossvalidation + # @param [String] crossvalidation uri + # @param [String,optional] subjectid + # @return [OpenTox::AlgorithmComparisonReport] nil if no report found + def self.find_for_crossvalidation( crossvalidation_uri, subjectid=nil ) + uris = RestClientWrapper.get(File.join(CONFIG[:services]["opentox-validation"], + "/report/algorithm_comparison?crossvalidation="+crossvalidation_uri), {:subjectid => subjectid}).chomp.split("\n") + uris.size==0 ? nil : AlgorithmComparisonReport.new(uris[-1]) + end + + # creates a crossvalidation report via crossvalidation + # @param [Hash] crossvalidation uri_hash, see example + # @param [String,optional] subjectid + # @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly + # @return [OpenTox::AlgorithmComparisonReport] + # example for hash: + # { :lazar-bbrc => [ http://host/validation/crossvalidation/x1, http://host/validation/crossvalidation/x2 ], + # :lazar-last => [ http://host/validation/crossvalidation/xy, http://host/validation/crossvalidation/xy ] } + def self.create( crossvalidation_uri_hash, subjectid=nil, waiting_task=nil ) + identifier = [] + validation_uris = [] + crossvalidation_uri_hash.each do |id, uris| + uris.each do |uri| + identifier << id + validation_uris << uri + end + end + uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/algorithm_comparison"), + { :validation_uris => validation_uris.join(","), :identifier => identifier.join(","), :subjectid => subjectid }, {}, waiting_task ) + AlgorithmComparisonReport.new(uri) + end + end + class QMRFReport include OpenTox |