diff options
author | Christoph Helma <helma@in-silico.ch> | 2010-11-11 09:31:27 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2010-11-11 09:31:27 +0100 |
commit | b93002b4ea50ff7e357da08abd10577347ce2d5f (patch) | |
tree | 840f1b8865032ce59917d8c5a3d6b2e499d19126 /lib/model.rb | |
parent | d6811507c1c1339cc4fe7cdb429b9b34b97dc422 (diff) |
first steps towards version 2.0, yard documentation started, passes compound, dataset, feature, algorithm, fminer tests
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- | lib/model.rb | 485 |
1 files changed, 376 insertions, 109 deletions
diff --git a/lib/model.rb b/lib/model.rb index d0d6703..63013cb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -1,143 +1,410 @@ module OpenTox + module Model + include OpenTox + + def run(params) + if CONFIG[:yaml_hosts].include?(URI.parse(@uri).host) + accept = 'application/x-yaml' + else + accept = 'application/rdf+xml' + end + begin + params[:acccept] = accept + #TODO fix: REstClientWrapper does not accept accept header + #RestClientWrapper.post(@uri,params)#,{:accept => accept}) + `curl -X POST -H "Accept:#{accept}" #{params.collect{|k,v| "-d #{k}=#{v}"}.join(" ")} #{@uri}`.to_s.chomp + rescue => e + LOGGER.error "Failed to run #{@uri} with #{params.inspect} (#{e.inspect})" + raise "Failed to run #{@uri} with #{params.inspect}" + end + end + +=begin + def classification? + #TODO replace with request to ontology server + if @metadata[DC.title] =~ /(?i)classification/ + return true + elsif @metadata[DC.title] =~ /(?i)regression/ + return false + elsif @uri =~/ntua/ and @metadata[DC.title] =~ /mlr/ + return false + elsif @uri =~/tu-muenchen/ and @metadata[DC.title] =~ /regression|M5P|GaussP/ + return false + elsif @uri =~/ambit2/ and @metadata[DC.title] =~ /pKa/ || @metadata[DC.title] =~ /Regression|Caco/ + return false + elsif @uri =~/majority/ + return (@uri =~ /class/) != nil + else + raise "unknown model, uri:'"+@uri+"' title:'"+@metadata[DC.title]+"'" + end + end +=end + class Generic + include Model + end + + class Lazar + + include Model + + #attr_accessor :prediction_type, :feature_type, :features, :effects, :activities, :p_values, :fingerprints, :parameters + attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :parameters, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm + + def initialize(uri=nil) + + if uri + super uri + else + super CONFIG[:services]["opentox-model"] + end + + # TODO: fix metadata, add parameters + @metadata[OT.algorithm] = File.join(CONFIG[:services]["opentox-algorithm"],"lazar") + + @features = [] + @effects = {} + @activities = {} + @p_values = {} + @fingerprints = {} + + @feature_calculation_algorithm = "substructure_match" + @similarity_algorithm = "weighted_tanimoto" + @prediction_algorithm = "weighted_majority_vote" - MODEL_ATTRIBS = [:uri, :title, :creator, :date, :format, :predictedVariables, :independentVariables, :dependentVariables, :trainingDataset, :algorithm] - MODEL_ATTRIBS.each{ |a| attr_accessor(a) } + @min_sim = 0.3 + + end def self.find(uri) - owl = OpenTox::Owl.from_uri(uri, "Model") - return self.new(owl) - end - - def self.to_rdf(model) - owl = OpenTox::Owl.create 'Model', model.uri - (MODEL_ATTRIBS - [:uri]).each do |a| - owl.set(a.to_s,model.send(a.to_s)) + YAML.load RestClientWrapper.get(uri,:content_type => 'application/x-yaml') + end + + def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil) + training_activities = OpenTox::Dataset.find(dataset_uri) + training_features = OpenTox::Dataset.find(feature_dataset_uri) + unless prediction_feature # try to read prediction_feature from dataset + raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1 + prediction_feature = training_activities.features.keys.first + params[:prediction_feature] = prediction_feature + end + lazar = Lazar.new + training_features = OpenTox::Dataset.new(feature_dataset_uri) + case training_features.feature_type + when "classification" + lazar.similarity_algorithm = "weighted_tanimoto" + when "regression" + lazar.similarity_algorithm = "weighted_euclid" end - owl.rdf end - - protected - def initialize(owl) - MODEL_ATTRIBS.each do |a| - self.send("#{a.to_s}=".to_sym, owl.get(a.to_s)) unless a==:uri + + def self.create(dataset_uri,prediction_feature=nil,feature_generation_uri=File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"),params=nil) + + training_activities = OpenTox::Dataset.find(dataset_uri) + + unless prediction_feature # try to read prediction_feature from dataset + raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1 + prediction_feature = training_activities.features.keys.first + params[:prediction_feature] = prediction_feature end - @uri = owl.uri - if ENV['RACK_ENV'] =~ /test|debug/ - begin - raise "uri invalid" unless Utils.is_uri?(@uri) - raise "no predicted variables" unless @predictedVariables and @predictedVariables.size>0 - rescue => ex - RestClientWrapper.raise_uri_error "invalid model: '"+ex.message+"'\n"+self.to_yaml+"\n",@uri.to_s + + lazar = Lazar.new + params[:feature_generation_uri] = feature_generation_uri + feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params).to_s + training_features = OpenTox::Dataset.find(feature_dataset_uri) + raise "Dataset #{feature_dataset_uri} not found or empty." if training_features.nil? + + # sorted features for index lookups + lazar.features = training_features.features.sort if training_features.feature_type == "regression" + + training_features.data_entries.each do |compound,entry| + lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound] + entry.keys.each do |feature| + case training_features.feature_type + when "fminer" + # fingerprints are sets + smarts = training_features.features[feature][OT.smarts] + lazar.fingerprints[compound] << smarts + unless lazar.features.include? smarts + lazar.features << smarts + lazar.p_values[smarts] = training_features.features[feature][OT.p_value] + lazar.effects[smarts] = training_features.features[feature][OT.effect] + end + when "classification" + # fingerprints are sets + if entry[feature].flatten.size == 1 + lazar.fingerprints[compound] << feature if entry[feature].flatten.first.match(TRUE_REGEXP) + lazar.features << feature unless lazar.features.include? feature + else + LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" + end + when "regression" + # fingerprints are arrays + if entry[feature].flatten.size == 1 + lazar.fingerprints[compound][lazar.features.index(feature)] = entry[feature].flatten.first + else + LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" + end + end + end + + lazar.activities[compound] = [] unless lazar.activities[compound] + training_activities.data_entries[compound][params[:prediction_feature]].each do |value| + case value.to_s + when "true" + lazar.activities[compound] << true + when "false" + lazar.activities[compound] << false + else + lazar.activities[compound] << value.to_f + lazar.prediction_type = "regression" + end end - LOGGER.warn "model has no dependent variable" unless @dependentVariables and @dependentVariables.size>0 - LOGGER.warn "model has no algorithm" unless @algorithm and @algorithm.size>0 - LOGGER.warn "model has no indenpendent variables" unless @independentVariables end + + if feature_generation_uri.match(/fminer/) + lazar.feature_calculation_algorithm = "substructure_match" + else + halt 404, "External feature generation services not yet supported" + end + + lazar.metadata[OT.dependentVariables] = params[:prediction_feature] + lazar.metadata[OT.trainingDataset] = dataset_uri + lazar.metadata[OT.featureDataset] = feature_dataset_uri + + lazar.parameters = { + "dataset_uri" => dataset_uri, + "prediction_feature" => prediction_feature, + "feature_generation_uri" => feature_generation_uri + } + + model_uri = lazar.save + LOGGER.info model_uri + " created #{Time.now}" + model_uri end - end - - class PredictionModel < Generic - - def self.build( algorithm_uri, algorithm_params ) - - LOGGER.debug "Build model, algorithm_uri:"+algorithm_uri.to_s+", algorithm_parms: "+algorithm_params.inspect.to_s - uri = OpenTox::RestClientWrapper.post(algorithm_uri,algorithm_params).to_s - LOGGER.debug "Build model done: "+uri.to_s - RestClientWrapper.raise_uri_error("Invalid build model result: '"+uri.to_s+"'", algorithm_uri, algorithm_params ) unless Utils.model_uri?(uri) - return PredictionModel.find(uri) - end - - def predict_dataset( dataset_uri ) - - LOGGER.debug "Predict dataset: "+dataset_uri.to_s+" with model "+@uri.to_s - uri = RestClientWrapper.post(@uri, {:accept => "text/uri-list", :dataset_uri=>dataset_uri}) - RestClientWrapper.raise_uri_error("Prediciton result no dataset uri: "+uri.to_s, @uri, {:dataset_uri=>dataset_uri} ) unless Utils.dataset_uri?(uri) - uri - end - - def classification? - #HACK replace with request to ontology server - if @title =~ /(?i)classification/ - return true - elsif @title =~ /(?i)regression/ - return false - elsif @uri =~/ntua/ and @title =~ /mlr/ - return false - elsif @uri =~/tu-muenchen/ and @title =~ /regression|M5P|GaussP/ - return false - elsif @uri =~/ambit2/ and @title =~ /pKa/ || @title =~ /Regression|Caco/ - return false - elsif @uri =~/majority/ - return (@uri =~ /class/) != nil + + def predict_dataset(dataset_uri) + @prediction_dataset = Dataset.create + @prediction_dataset.add_metadata({ + OT.hasSource => @lazar.uri, + DC.creator => @lazar.uri, + DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )) + }) + @prediction_dataset.add_parameters({"dataset_uri" => dataset_uri}) + Dataset.new(dataset_uri).load_compounds.each do |compound_uri| + predict(compound_uri,false) + end + @prediction_dataset.save + @prediction_dataset.uri + end + + def predict(compound_uri,verbose=false) + + @compound = Compound.new compound_uri + + unless @prediction_dataset + @prediction_dataset = Dataset.create + @prediction_dataset.add_metadata( { + OT.hasSource => @lazar.uri, + DC.creator => @lazar.uri, + DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )) + } ) + @prediction_dataset.add_parameters( {"compound_uri" => compound_uri} ) + end + + neighbors + eval @prediction_algorithm + + if @prediction + + feature_uri = File.join( @prediction_dataset.uri, "feature", @prediction_dataset.compounds.size) + @prediction_dataset.add @compound.uri, feature_uri, @prediction + + feature_metadata = @prediction_dataset.metadata + feature_metadata[DC.title] = File.basename(@metadata[OT.dependentVariables]) + feature_metadata[OT.prediction] = @prediction + feature_metadata[OT.confidence] = @confidence + @prediction_dataset.add_feature(feature_uri, feature_metadata) + + if verbose + if @compound_features + @compound_features.each do |feature| + @prediction_dataset.add @compound.uri, feature, true + end + end + n = 0 + @neighbors.sort{|a,b| a[:similarity] <=> b[:similarity]}.each do |neighbor| + neighbor_uri = File.join( @prediction_dataset.uri, "feature/neighbor", n ) + @prediction_dataset.add @compound.uri, neighbor_uri, true + @prediction_dataset.add_feature(neighbor, { + OT.compound => neighbor[:compound], + OT.similarity => neighbor[:similarity], + OT.activity => neighbor[:activity] + }) + n+=1 + end + end + end + @prediction_dataset.save + @prediction_dataset.uri + end + + def weighted_majority_vote + conf = 0.0 + @neighbors.each do |neighbor| + case neighbor[:activity].to_s + when 'true' + conf += OpenTox::Algorithm.gauss(neighbor[:similarity]) + when 'false' + conf -= OpenTox::Algorithm.gauss(neighbor[:similarity]) + end + end + if conf > 0.0 + @prediction = true + elsif conf < 0.0 + @prediction = false else - raise "unknown model, uri:'"+@uri.to_s+"' title:'"+@title.to_s+"'" + @prediction = nil end + @confidence = conf/@neighbors.size if @neighbors.size > 0 end - end - - class Lazar < Generic - - attr_accessor :feature_dataset_uri, :effects, :activities, :p_values, :fingerprints, :features - - def initialize - @source = "http://github.com/helma/opentox-model" - @algorithm = File.join(CONFIG[:services]["opentox-algorithm"],"lazar") - #@independent_variables = File.join(CONFIG[:services]["opentox-algorithm"],"fminer#BBRC_representative") - @features = [] - @effects = {} - @activities = {} - @p_values = {} - @fingerprints = {} + + def local_svm_regression + sims = @neighbors.collect{ |n| n[:similarity] } # similarity values between query and neighbors + conf = sims.inject{|sum,x| sum + x } + acts = @neighbors.collect do |n| + act = n[:activity] + # TODO: check this in model creation + raise "0 values not allowed in training dataset. log10 is calculated internally." if act.to_f == 0 + Math.log10(act.to_f) + end # activities of neighbors for supervised learning + + neighbor_matches = @neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches + gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel + if neighbor_matches.size == 0 + raise "No neighbors found" + else + # gram matrix + (0..(neighbor_matches.length-1)).each do |i| + gram_matrix[i] = [] + # lower triangle + (0..(i-1)).each do |j| + sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values) + gram_matrix[i] << OpenTox::Algorithm.gauss(sim) + end + # diagonal element + gram_matrix[i][i] = 1.0 + # upper triangle + ((i+1)..(neighbor_matches.length-1)).each do |j| + sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values) # double calculation? + gram_matrix[i] << OpenTox::Algorithm.gauss(sim) + end + end + + @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests + @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed + LOGGER.debug "Setting R data ..." + # set data + @r.gram_matrix = gram_matrix.flatten + @r.n = neighbor_matches.size + @r.y = acts + @r.sims = sims + + LOGGER.debug "Preparing R data ..." + # prepare data + @r.eval "y<-as.vector(y)" + @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))" + @r.eval "sims<-as.vector(sims)" + + # model + support vectors + LOGGER.debug "Creating SVM model ..." + @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)" + @r.eval "sv<-as.vector(SVindex(model))" + @r.eval "sims<-sims[sv]" + @r.eval "sims<-as.kernelMatrix(matrix(sims,1))" + LOGGER.debug "Predicting ..." + @r.eval "p<-predict(model,sims)[1,1]" + @prediction = 10**(@r.p.to_f) + LOGGER.debug "Prediction is: '" + prediction.to_s + "'." + @r.quit # free R + end + @confidence = conf/@neighbors.size if @neighbors.size > 0 + end - def save - @features.uniq! - resource = RestClient::Resource.new(CONFIG[:services]["opentox-model"]) - resource.post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s + def neighbors + + @compound_features = eval(@feature_calculation_algorithm) if @feature_calculation_algorithm + + @neighbors = {} + @activities.each do |training_compound,activities| + @training_compound = training_compound + sim = eval(@similarity_algorithm) + if sim > @min_sim + activities.each do |act| + @neighbors << { + :compound => @training_compound, + :similarity => sim, + :features => @fingerprints[@training_compound], + :activity => act + } + end + end + end + end - def self.find_all - RestClientWrapper.get(CONFIG[:services]["opentox-model"]).chomp.split("\n") + def tanimoto + OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound]) end - def self.predict(compound_uri,model_uri) - #RestClientWrapper.post(model_uri,{:compound_uri => compound_uri, :accept => 'application/x-yaml'}) - `curl -X POST -d 'compound_uri=#{compound_uri}' -H 'Accept:application/x-yaml' #{model_uri}` + def weighted_tanimoto + OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values) end - end - - class PropertyLazar < Generic - - attr_accessor :feature_dataset_uri, :properties, :features, :activities#, :effects, :p_values - - def initialize - @source = "http://github.com/helma/opentox-model" - @algorithm = File.join(CONFIG[:services]["opentox-algorithm"],"property_lazar") - #@independent_variables = File.join(CONFIG[:services]["opentox-algorithm"],"fminer#BBRC_representative") - @features = [] - #@effects = {} - @activities = {} - #@p_values = {} - @properties = {} + + def euclid + OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound]) + end + + def weighted_euclid + OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values) + end + + def substructure_match + @compound.match(@features) + end + + def database_search + #TODO add features method to dataset + Dataset.new(@metadata[OT.featureDataset]).features(@compound.uri) + end + + def database_activity(compound_uri) + prediction = OpenTox::Dataset.new + # find database activities + if @activities[compound_uri] + @activities[compound_uri].each { |act| prediction.add compound_uri, @metadata[OT.dependentVariables], act } + prediction.add_metadata(OT.hasSource => @metadata[OT.trainingDataset]) + prediction + else + nil + end end def save - @features.uniq! - resource = RestClient::Resource.new(CONFIG[:services]["opentox-model"]) - resource.post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s + RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) end - def self.find_all - RestClientWrapper.get(CONFIG[:services]["opentox-model"]).chomp.split("\n") + def self.all + RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s.split("\n") end - def self.predict(compound_uri,model_uri) - #RestClientWrapper.post(model_uri,{:compound_uri => compound_uri, :accept => 'application/x-yaml'}) - `curl -X POST -d 'compound_uri=#{compound_uri}' -H 'Accept:application/x-yaml' #{model_uri}` + def delete + RestClientWrapper.delete @uri unless @uri == CONFIG[:services]["opentox-model"] end + end end end |