diff options
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- | lib/model.rb | 466 |
1 files changed, 181 insertions, 285 deletions
diff --git a/lib/model.rb b/lib/model.rb index 63013cb..c6a2cf4 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -4,6 +4,9 @@ module OpenTox include OpenTox + # Run a model with parameters + # @param [Hash] params Parameters for OpenTox model + # @return [text/uri-list] Task or resource URI def run(params) if CONFIG[:yaml_hosts].include?(URI.parse(@uri).host) accept = 'application/x-yaml' @@ -11,47 +14,25 @@ module OpenTox accept = 'application/rdf+xml' end begin - params[:acccept] = accept - #TODO fix: REstClientWrapper does not accept accept header - #RestClientWrapper.post(@uri,params)#,{:accept => accept}) - `curl -X POST -H "Accept:#{accept}" #{params.collect{|k,v| "-d #{k}=#{v}"}.join(" ")} #{@uri}`.to_s.chomp + RestClientWrapper.post(@uri,{:accept => accept},params).to_s rescue => e LOGGER.error "Failed to run #{@uri} with #{params.inspect} (#{e.inspect})" raise "Failed to run #{@uri} with #{params.inspect}" end end - -=begin - def classification? - #TODO replace with request to ontology server - if @metadata[DC.title] =~ /(?i)classification/ - return true - elsif @metadata[DC.title] =~ /(?i)regression/ - return false - elsif @uri =~/ntua/ and @metadata[DC.title] =~ /mlr/ - return false - elsif @uri =~/tu-muenchen/ and @metadata[DC.title] =~ /regression|M5P|GaussP/ - return false - elsif @uri =~/ambit2/ and @metadata[DC.title] =~ /pKa/ || @metadata[DC.title] =~ /Regression|Caco/ - return false - elsif @uri =~/majority/ - return (@uri =~ /class/) != nil - else - raise "unknown model, uri:'"+@uri+"' title:'"+@metadata[DC.title]+"'" - end - end -=end + # Generic OpenTox model class for all API compliant services class Generic include Model end + # Lazy Structure Activity Relationship class class Lazar include Model + include Algorithm - #attr_accessor :prediction_type, :feature_type, :features, :effects, :activities, :p_values, :fingerprints, :parameters - attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, 
:parameters, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm + attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim def initialize(uri=nil) @@ -61,7 +42,6 @@ module OpenTox super CONFIG[:services]["opentox-model"] end - # TODO: fix metadata, add parameters @metadata[OT.algorithm] = File.join(CONFIG[:services]["opentox-algorithm"],"lazar") @features = [] @@ -70,284 +50,192 @@ module OpenTox @p_values = {} @fingerprints = {} - @feature_calculation_algorithm = "substructure_match" - @similarity_algorithm = "weighted_tanimoto" - @prediction_algorithm = "weighted_majority_vote" + @feature_calculation_algorithm = "Substructure.match" + @similarity_algorithm = "Similarity.tanimoto" + @prediction_algorithm = "Neighbors.weighted_majority_vote" @min_sim = 0.3 end - def self.find(uri) - YAML.load RestClientWrapper.get(uri,:content_type => 'application/x-yaml') + # Get URIs of all lazar models + # @return [Array] List of lazar model URIs + def self.all + RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s.split("\n") end - def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil) - training_activities = OpenTox::Dataset.find(dataset_uri) - training_features = OpenTox::Dataset.find(feature_dataset_uri) - unless prediction_feature # try to read prediction_feature from dataset - raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." 
unless training_activities.features.size == 1 - prediction_feature = training_activities.features.keys.first - params[:prediction_feature] = prediction_feature - end - lazar = Lazar.new - training_features = OpenTox::Dataset.new(feature_dataset_uri) - case training_features.feature_type - when "classification" - lazar.similarity_algorithm = "weighted_tanimoto" - when "regression" - lazar.similarity_algorithm = "weighted_euclid" - end + # Find a lazar model + # @param [String] uri Model URI + # @return [OpenTox::Model::Lazar] lazar model + def self.find(uri) + YAML.load RestClientWrapper.get(uri,:accept => 'application/x-yaml') end - def self.create(dataset_uri,prediction_feature=nil,feature_generation_uri=File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"),params=nil) - - training_activities = OpenTox::Dataset.find(dataset_uri) - - unless prediction_feature # try to read prediction_feature from dataset - raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1 - prediction_feature = training_activities.features.keys.first - params[:prediction_feature] = prediction_feature - end - - lazar = Lazar.new - params[:feature_generation_uri] = feature_generation_uri - feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params).to_s - training_features = OpenTox::Dataset.find(feature_dataset_uri) - raise "Dataset #{feature_dataset_uri} not found or empty." if training_features.nil? 
- - # sorted features for index lookups - lazar.features = training_features.features.sort if training_features.feature_type == "regression" - - training_features.data_entries.each do |compound,entry| - lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound] - entry.keys.each do |feature| - case training_features.feature_type - when "fminer" - # fingerprints are sets - smarts = training_features.features[feature][OT.smarts] - lazar.fingerprints[compound] << smarts - unless lazar.features.include? smarts - lazar.features << smarts - lazar.p_values[smarts] = training_features.features[feature][OT.p_value] - lazar.effects[smarts] = training_features.features[feature][OT.effect] - end - when "classification" - # fingerprints are sets - if entry[feature].flatten.size == 1 - lazar.fingerprints[compound] << feature if entry[feature].flatten.first.match(TRUE_REGEXP) - lazar.features << feature unless lazar.features.include? feature - else - LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" - end - when "regression" - # fingerprints are arrays - if entry[feature].flatten.size == 1 - lazar.fingerprints[compound][lazar.features.index(feature)] = entry[feature].flatten.first - else - LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" - end - end - end - - lazar.activities[compound] = [] unless lazar.activities[compound] - training_activities.data_entries[compound][params[:prediction_feature]].each do |value| - case value.to_s - when "true" - lazar.activities[compound] << true - when "false" - lazar.activities[compound] << false - else - lazar.activities[compound] << value.to_f - lazar.prediction_type = "regression" - end - end - end - - if feature_generation_uri.match(/fminer/) - lazar.feature_calculation_algorithm = "substructure_match" - else - halt 404, "External feature generation services not yet supported" - end - - 
lazar.metadata[OT.dependentVariables] = params[:prediction_feature] - lazar.metadata[OT.trainingDataset] = dataset_uri - lazar.metadata[OT.featureDataset] = feature_dataset_uri + # Create a new lazar model + # @param [optional,Hash] params Parameters for the lazar algorithm (OpenTox::Algorithm::Lazar) + # @return [OpenTox::Model::Lazar] lazar model + def self.create(params) + lazar_algorithm = OpenTox::Algorithm::Generic.new File.join( CONFIG[:services]["opentox-algorithm"],"lazar") + model_uri = lazar_algorithm.run(params) + OpenTox::Model::Lazar.find(model_uri) + end - lazar.parameters = { - "dataset_uri" => dataset_uri, - "prediction_feature" => prediction_feature, - "feature_generation_uri" => feature_generation_uri - } - - model_uri = lazar.save - LOGGER.info model_uri + " created #{Time.now}" - model_uri +=begin + # Create a new lazar model and return task + # @param [optional,Hash] params Parameters for the lazar algorithm (OpenTox::Algorithm::Lazar) + # @return [OpenTox::Task] Task for lazar model creation + def self.create_task(params) + task_uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"],"lazar"), {}, params, false) + Task.find(task_uri) + #model_uri = lazar_algorithm.run(params) + #OpenTox::Model::Lazar.new(model_uri) + end +=end + def parameter(param) + @metadata[OT.parameters].collect{|p| p[OT.paramValue] if p[DC.title] == param}.compact.first end def predict_dataset(dataset_uri) @prediction_dataset = Dataset.create @prediction_dataset.add_metadata({ - OT.hasSource => @lazar.uri, - DC.creator => @lazar.uri, - DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )) + OT.hasSource => @uri, + DC.creator => @uri, + DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )), + OT.parameters => [{DC.title => "dataset_uri", OT.paramValue => dataset_uri}] }) - @prediction_dataset.add_parameters({"dataset_uri" => dataset_uri}) - Dataset.new(dataset_uri).load_compounds.each do |compound_uri| + d = 
Dataset.new(dataset_uri) + d.load_compounds + d.compounds.each do |compound_uri| predict(compound_uri,false) end @prediction_dataset.save - @prediction_dataset.uri + @prediction_dataset end + # Predict a compound + # @param [String] compound_uri Compound URI + # @param [optional,Boolean] verbose Verbose prediction (output includes neighbors and features) + # @return [OpenTox::Dataset] Dataset with prediction def predict(compound_uri,verbose=false) @compound = Compound.new compound_uri + features = {} unless @prediction_dataset + #@prediction_dataset = cached_prediction + #return @prediction_dataset if cached_prediction @prediction_dataset = Dataset.create @prediction_dataset.add_metadata( { - OT.hasSource => @lazar.uri, - DC.creator => @lazar.uri, - DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )) + OT.hasSource => @uri, + DC.creator => @uri, + # TODO: fix dependentVariable + DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )), + OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}] } ) - @prediction_dataset.add_parameters( {"compound_uri" => compound_uri} ) end - neighbors - eval @prediction_algorithm - - if @prediction + return @prediction_dataset if database_activity - feature_uri = File.join( @prediction_dataset.uri, "feature", @prediction_dataset.compounds.size) - @prediction_dataset.add @compound.uri, feature_uri, @prediction + neighbors + prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values})") + + prediction_feature_uri = File.join( @prediction_dataset.uri, "feature", "prediction", File.basename(@metadata[OT.dependentVariables]),@prediction_dataset.compounds.size.to_s) + # TODO: fix dependentVariable + @prediction_dataset.metadata[OT.dependentVariables] = prediction_feature_uri + + if @neighbors.size == 0 + @prediction_dataset.add_feature(prediction_feature_uri, { + OT.hasSource => @uri, + DC.creator => @uri, + 
DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )), + OT.error => "No similar compounds in training dataset.", + OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}] + }) + @prediction_dataset.add @compound.uri, prediction_feature_uri, prediction[:prediction] - feature_metadata = @prediction_dataset.metadata - feature_metadata[DC.title] = File.basename(@metadata[OT.dependentVariables]) - feature_metadata[OT.prediction] = @prediction - feature_metadata[OT.confidence] = @confidence - @prediction_dataset.add_feature(feature_uri, feature_metadata) + else + @prediction_dataset.add_feature(prediction_feature_uri, { + OT.hasSource => @uri, + DC.creator => @uri, + DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )), + OT.prediction => prediction[:prediction], + OT.confidence => prediction[:confidence], + OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}] + }) + @prediction_dataset.add @compound.uri, prediction_feature_uri, prediction[:prediction] if verbose - if @compound_features + if @feature_calculation_algorithm == "Substructure.match" + f = 0 + @compound_features.each do |feature| + feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) + features[feature] = feature_uri + @prediction_dataset.add_feature(feature_uri, { + OT.smarts => feature, + OT.p_value => @p_values[feature], + OT.effect => @effects[feature] + }) + @prediction_dataset.add @compound.uri, feature_uri, true + f+=1 + end + else @compound_features.each do |feature| + features[feature] = feature @prediction_dataset.add @compound.uri, feature, true end end n = 0 - @neighbors.sort{|a,b| a[:similarity] <=> b[:similarity]}.each do |neighbor| - neighbor_uri = File.join( @prediction_dataset.uri, "feature/neighbor", n ) - @prediction_dataset.add @compound.uri, neighbor_uri, true - @prediction_dataset.add_feature(neighbor, { + @neighbors.each do |neighbor| + neighbor_uri = File.join( 
@prediction_dataset.uri, "feature", "neighbor", n.to_s ) + @prediction_dataset.add_feature(neighbor_uri, { OT.compound => neighbor[:compound], OT.similarity => neighbor[:similarity], OT.activity => neighbor[:activity] }) + @prediction_dataset.add @compound.uri, neighbor_uri, true + f = 0 unless f + neighbor[:features].each do |feature| + if @feature_calculation_algorithm == "Substructure.match" + feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) unless feature_uri = features[feature] + else + feature_uri = feature + end + @prediction_dataset.add neighbor[:compound], feature_uri, true + unless features.has_key? feature + features[feature] = feature_uri + @prediction_dataset.add_feature(feature_uri, { + OT.smarts => feature, + OT.p_value => @p_values[feature], + OT.effect => @effects[feature] + }) + f+=1 + end + end n+=1 end + # what happens with dataset predictions? end end - @prediction_dataset.save - @prediction_dataset.uri - end - - def weighted_majority_vote - conf = 0.0 - @neighbors.each do |neighbor| - case neighbor[:activity].to_s - when 'true' - conf += OpenTox::Algorithm.gauss(neighbor[:similarity]) - when 'false' - conf -= OpenTox::Algorithm.gauss(neighbor[:similarity]) - end - end - if conf > 0.0 - @prediction = true - elsif conf < 0.0 - @prediction = false - else - @prediction = nil - end - @confidence = conf/@neighbors.size if @neighbors.size > 0 - end - - def local_svm_regression - sims = @neighbors.collect{ |n| n[:similarity] } # similarity values between query and neighbors - conf = sims.inject{|sum,x| sum + x } - acts = @neighbors.collect do |n| - act = n[:activity] - # TODO: check this in model creation - raise "0 values not allowed in training dataset. log10 is calculated internally." 
if act.to_f == 0 - Math.log10(act.to_f) - end # activities of neighbors for supervised learning - - neighbor_matches = @neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches - gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel - if neighbor_matches.size == 0 - raise "No neighbors found" - else - # gram matrix - (0..(neighbor_matches.length-1)).each do |i| - gram_matrix[i] = [] - # lower triangle - (0..(i-1)).each do |j| - sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values) - gram_matrix[i] << OpenTox::Algorithm.gauss(sim) - end - # diagonal element - gram_matrix[i][i] = 1.0 - # upper triangle - ((i+1)..(neighbor_matches.length-1)).each do |j| - sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values) # double calculation? - gram_matrix[i] << OpenTox::Algorithm.gauss(sim) - end - end - @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed - LOGGER.debug "Setting R data ..." - # set data - @r.gram_matrix = gram_matrix.flatten - @r.n = neighbor_matches.size - @r.y = acts - @r.sims = sims - - LOGGER.debug "Preparing R data ..." - # prepare data - @r.eval "y<-as.vector(y)" - @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))" - @r.eval "sims<-as.vector(sims)" - - # model + support vectors - LOGGER.debug "Creating SVM model ..." - @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)" - @r.eval "sv<-as.vector(SVindex(model))" - @r.eval "sims<-sims[sv]" - @r.eval "sims<-as.kernelMatrix(matrix(sims,1))" - LOGGER.debug "Predicting ..." - @r.eval "p<-predict(model,sims)[1,1]" - @prediction = 10**(@r.p.to_f) - LOGGER.debug "Prediction is: '" + prediction.to_s + "'." 
- @r.quit # free R - end - @confidence = conf/@neighbors.size if @neighbors.size > 0 - + @prediction_dataset.save + @prediction_dataset end + # Find neighbors and store them as object variable def neighbors - @compound_features = eval(@feature_calculation_algorithm) if @feature_calculation_algorithm + @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm - @neighbors = {} - @activities.each do |training_compound,activities| - @training_compound = training_compound - sim = eval(@similarity_algorithm) + @neighbors = [] + @fingerprints.each do |training_compound,training_features| + #@activities.each do |training_compound,activities| + sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)") if sim > @min_sim - activities.each do |act| + @activities[training_compound].each do |act| @neighbors << { - :compound => @training_compound, + :compound => training_compound, :similarity => sim, - :features => @fingerprints[@training_compound], + :features => training_features, :activity => act } end @@ -356,55 +244,63 @@ module OpenTox end - def tanimoto - OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound]) - end - - def weighted_tanimoto - OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values) - end - - def euclid - OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound]) - end - - def weighted_euclid - OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values) - end - - def substructure_match - @compound.match(@features) - end - - def database_search - #TODO add features method to dataset - Dataset.new(@metadata[OT.featureDataset]).features(@compound.uri) +=begin + def cached_prediction + dataset_uri = PredictionCache.find(:model_uri => @uri, :compound_uri => @compound.uri).dataset_uri) + return false unless dataset_uri + @prediction_dataset = 
Dataset.find(dataset_uri) + return false unless @prediction_dataset + LOGGER.debug "Serving cached prediction" + true end +=end - def database_activity(compound_uri) - prediction = OpenTox::Dataset.new - # find database activities - if @activities[compound_uri] - @activities[compound_uri].each { |act| prediction.add compound_uri, @metadata[OT.dependentVariables], act } - prediction.add_metadata(OT.hasSource => @metadata[OT.trainingDataset]) - prediction + # Find database activities and store them in @prediction_dataset + # @return [Boolean] true if compound has databasse activities, false if not + def database_activity + if @activities[@compound.uri] + @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], act } + @prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset]) + @prediction_dataset.save + true else - nil + false end end + # Save model at model service def save - RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) - end - - def self.all - RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s.split("\n") + self.uri = RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) end + # Delete model at model service def delete RestClientWrapper.delete @uri unless @uri == CONFIG[:services]["opentox-model"] end +=begin +=end + +=begin + def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil) + training_activities = OpenTox::Dataset.find(dataset_uri) + training_features = OpenTox::Dataset.find(feature_dataset_uri) + unless prediction_feature # try to read prediction_feature from dataset + raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." 
unless training_activities.features.size == 1 + prediction_feature = training_activities.features.keys.first + params[:prediction_feature] = prediction_feature + end + lazar = Lazar.new + training_features = OpenTox::Dataset.new(feature_dataset_uri) + case training_features.feature_type + when "classification" + lazar.similarity_algorithm = "weighted_tanimoto" + when "regression" + lazar.similarity_algorithm = "weighted_euclid" + end + end +=end + end end end |