summary refs log tree commit diff
path: root/lib/model.rb
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2010-11-11 09:31:27 +0100
committer Christoph Helma <helma@in-silico.ch> 2010-11-11 09:31:27 +0100
commit b93002b4ea50ff7e357da08abd10577347ce2d5f (patch)
tree 840f1b8865032ce59917d8c5a3d6b2e499d19126 /lib/model.rb
parent d6811507c1c1339cc4fe7cdb429b9b34b97dc422 (diff)
first steps towards version 2.0, yard documentation started, passes compound, dataset, feature, algorithm, fminer tests
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- lib/model.rb 485
1 files changed, 376 insertions, 109 deletions
diff --git a/lib/model.rb b/lib/model.rb
index d0d6703..63013cb 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -1,143 +1,410 @@
module OpenTox
+
module Model
+ include OpenTox
+
+ # Run the service at @uri by POSTing the given parameters with curl.
+ #
+ # The Accept header is application/x-yaml for hosts listed in
+ # CONFIG[:yaml_hosts] and application/rdf+xml for all other hosts.
+ #
+ # @param [Hash] params Form parameters; an :accept entry is added to this hash
+ # @return [String] Output of the curl call without the trailing newline
+ # @raise [RuntimeError] if an exception occurs while posting
+ def run(params)
+ if CONFIG[:yaml_hosts].include?(URI.parse(@uri).host)
+ accept = 'application/x-yaml'
+ else
+ accept = 'application/rdf+xml'
+ end
+ begin
+ params[:accept] = accept
+ #TODO fix: REstClientWrapper does not accept accept header
+ #RestClientWrapper.post(@uri,params)#,{:accept => accept})
+ # NOTE(review): parameter keys/values and @uri are interpolated into a shell
+ # command without escaping - they must not come from untrusted input.
+ `curl -X POST -H "Accept:#{accept}" #{params.collect{|k,v| "-d #{k}=#{v}"}.join(" ")} #{@uri}`.to_s.chomp
+ rescue => e
+ LOGGER.error "Failed to run #{@uri} with #{params.inspect} (#{e.inspect})"
+ raise "Failed to run #{@uri} with #{params.inspect}"
+ end
+ end
+
+=begin
+ def classification?
+ #TODO replace with request to ontology server
+ if @metadata[DC.title] =~ /(?i)classification/
+ return true
+ elsif @metadata[DC.title] =~ /(?i)regression/
+ return false
+ elsif @uri =~/ntua/ and @metadata[DC.title] =~ /mlr/
+ return false
+ elsif @uri =~/tu-muenchen/ and @metadata[DC.title] =~ /regression|M5P|GaussP/
+ return false
+ elsif @uri =~/ambit2/ and @metadata[DC.title] =~ /pKa/ || @metadata[DC.title] =~ /Regression|Caco/
+ return false
+ elsif @uri =~/majority/
+ return (@uri =~ /class/) != nil
+ else
+ raise "unknown model, uri:'"+@uri+"' title:'"+@metadata[DC.title]+"'"
+ end
+ end
+=end
+
class Generic
+ include Model # Generic model: all behaviour comes from the Model mixin (e.g. run)
+ end
+
+ class Lazar
+
+ include Model
+
+ #attr_accessor :prediction_type, :feature_type, :features, :effects, :activities, :p_values, :fingerprints, :parameters
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :parameters, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm # model state plus the names of the pluggable algorithm methods; NOTE(review): there is no :prediction_type accessor although self.create assigns lazar.prediction_type - confirm
+
+ # Set up an empty lazar model.
+ #
+ # @param [optional, String] uri Model URI; the configured "opentox-model" service is used if omitted
+ def initialize(uri=nil)
+ super(uri || CONFIG[:services]["opentox-model"])
+
+ # TODO: fix metadata, add parameters
+ algorithm_service = CONFIG[:services]["opentox-algorithm"]
+ @metadata[OT.algorithm] = File.join(algorithm_service,"lazar")
+
+ # containers for the training data, all empty for a new model
+ @features, @effects, @activities = [], {}, {}
+ @p_values, @fingerprints = {}, {}
+
+ # names of the methods that are evaluated during a prediction
+ @feature_calculation_algorithm = "substructure_match"
+ @similarity_algorithm = "weighted_tanimoto"
+ @prediction_algorithm = "weighted_majority_vote"
- MODEL_ATTRIBS = [:uri, :title, :creator, :date, :format, :predictedVariables, :independentVariables, :dependentVariables, :trainingDataset, :algorithm]
- MODEL_ATTRIBS.each{ |a| attr_accessor(a) }
+ # training compounds need a similarity above this value to become neighbors
+ @min_sim = 0.3
+ end
def self.find(uri)
- owl = OpenTox::Owl.from_uri(uri, "Model")
- return self.new(owl)
- end
-
- def self.to_rdf(model)
- owl = OpenTox::Owl.create 'Model', model.uri
- (MODEL_ATTRIBS - [:uri]).each do |a|
- owl.set(a.to_s,model.send(a.to_s))
+ YAML.load RestClientWrapper.get(uri,:content_type => 'application/x-yaml') # fetches uri and deserializes the response with YAML; NOTE(review): YAML.load on a remote payload can instantiate arbitrary objects - only use with trusted services
+ end
+
+ # Prepare a lazar model from an activity dataset and an existing feature dataset.
+ #
+ # NOTE(review): this method looks unfinished - the model is neither filled with
+ # training data nor saved, and the return value is the value of the case
+ # expression, not the model. Confirm the intended result before relying on it.
+ #
+ # @param [String] dataset_uri URI of the training dataset with the activities
+ # @param [String] feature_dataset_uri URI of the dataset with the features
+ # @param [optional, String] prediction_feature Feature to predict; read from the training dataset if it has exactly one feature
+ # @raise [RuntimeError] if prediction_feature is omitted and the training dataset does not have exactly one feature
+ def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil)
+ training_activities = OpenTox::Dataset.find(dataset_uri)
+ training_features = OpenTox::Dataset.find(feature_dataset_uri)
+ unless prediction_feature # try to read prediction_feature from dataset
+ raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1
+ # there is no params hash in this method, the feature is only kept locally
+ prediction_feature = training_activities.features.keys.first
+ end
+ lazar = Lazar.new
+ # NOTE(review): replaces the dataset loaded with Dataset.find above - confirm which one is intended
+ training_features = OpenTox::Dataset.new(feature_dataset_uri)
+ case training_features.feature_type
+ when "classification"
+ lazar.similarity_algorithm = "weighted_tanimoto"
+ when "regression"
+ lazar.similarity_algorithm = "weighted_euclid"
end
- owl.rdf
end
-
- protected
- def initialize(owl)
- MODEL_ATTRIBS.each do |a|
- self.send("#{a.to_s}=".to_sym, owl.get(a.to_s)) unless a==:uri
+
+ # Create and save a lazar model from a training dataset.
+ #
+ # Features are generated by the feature generation service, fingerprints and
+ # activities of all training compounds are collected and the model is saved.
+ #
+ # @param [String] dataset_uri Training dataset URI
+ # @param [optional, String] prediction_feature Feature to predict; read from the training dataset if it has exactly one feature
+ # @param [optional, String] feature_generation_uri Feature generation service (default: fminer/bbrc of the configured algorithm service)
+ # @param [optional, Hash] params Parameters for the feature generation service; :prediction_feature and :feature_generation_uri are set by this method
+ # @return Result of lazar.save (logged as the model URI)
+ # @raise [RuntimeError] if no prediction feature can be determined or the feature dataset is nil
+ def self.create(dataset_uri,prediction_feature=nil,feature_generation_uri=File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"),params=nil)
+
+ # params defaults to nil, but is used as a Hash below
+ params ||= {}
+
+ training_activities = OpenTox::Dataset.find(dataset_uri)
+
+ unless prediction_feature # try to read prediction_feature from dataset
+ raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1
+ prediction_feature = training_activities.features.keys.first
end
- @uri = owl.uri
- if ENV['RACK_ENV'] =~ /test|debug/
- begin
- raise "uri invalid" unless Utils.is_uri?(@uri)
- raise "no predicted variables" unless @predictedVariables and @predictedVariables.size>0
- rescue => ex
- RestClientWrapper.raise_uri_error "invalid model: '"+ex.message+"'\n"+self.to_yaml+"\n",@uri.to_s
+
+ # params[:prediction_feature] is read below in every case, so it is also set
+ # when the prediction feature was passed as an argument
+ params[:prediction_feature] = prediction_feature
+
+ lazar = Lazar.new
+ params[:feature_generation_uri] = feature_generation_uri
+ feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params).to_s
+ training_features = OpenTox::Dataset.find(feature_dataset_uri)
+ raise "Dataset #{feature_dataset_uri} not found or empty." if training_features.nil?
+
+ # sorted features for index lookups
+ lazar.features = training_features.features.sort if training_features.feature_type == "regression"
+
+ training_features.data_entries.each do |compound,entry|
+ lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound]
+ entry.keys.each do |feature|
+ case training_features.feature_type
+ when "fminer"
+ # fingerprints are sets
+ smarts = training_features.features[feature][OT.smarts]
+ lazar.fingerprints[compound] << smarts
+ unless lazar.features.include? smarts
+ lazar.features << smarts
+ lazar.p_values[smarts] = training_features.features[feature][OT.p_value]
+ lazar.effects[smarts] = training_features.features[feature][OT.effect]
+ end
+ when "classification"
+ # fingerprints are sets
+ if entry[feature].flatten.size == 1
+ lazar.fingerprints[compound] << feature if entry[feature].flatten.first.match(TRUE_REGEXP)
+ lazar.features << feature unless lazar.features.include? feature
+ else
+ LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}"
+ end
+ when "regression"
+ # fingerprints are arrays
+ if entry[feature].flatten.size == 1
+ lazar.fingerprints[compound][lazar.features.index(feature)] = entry[feature].flatten.first
+ else
+ LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}"
+ end
+ end
+ end
+
+ lazar.activities[compound] = [] unless lazar.activities[compound]
+ training_activities.data_entries[compound][params[:prediction_feature]].each do |value|
+ case value.to_s
+ when "true"
+ lazar.activities[compound] << true
+ when "false"
+ lazar.activities[compound] << false
+ else
+ lazar.activities[compound] << value.to_f
+ # NOTE(review): Lazar declares no prediction_type accessor in this file - confirm
+ # that the setter exists (or whether prediction_algorithm was meant)
+ lazar.prediction_type = "regression"
+ end
end
- LOGGER.warn "model has no dependent variable" unless @dependentVariables and @dependentVariables.size>0
- LOGGER.warn "model has no algorithm" unless @algorithm and @algorithm.size>0
- LOGGER.warn "model has no indenpendent variables" unless @independentVariables
end
+
+ if feature_generation_uri.match(/fminer/)
+ lazar.feature_calculation_algorithm = "substructure_match"
+ else
+ # NOTE(review): halt is not defined in this file (presumably a web framework helper) - confirm it is available here
+ halt 404, "External feature generation services not yet supported"
+ end
+
+ lazar.metadata[OT.dependentVariables] = params[:prediction_feature]
+ lazar.metadata[OT.trainingDataset] = dataset_uri
+ lazar.metadata[OT.featureDataset] = feature_dataset_uri
+
+ lazar.parameters = {
+ "dataset_uri" => dataset_uri,
+ "prediction_feature" => prediction_feature,
+ "feature_generation_uri" => feature_generation_uri
+ }
+
+ model_uri = lazar.save
+ LOGGER.info model_uri + " created #{Time.now}"
+ model_uri
end
- end
-
- class PredictionModel < Generic
-
- def self.build( algorithm_uri, algorithm_params )
-
- LOGGER.debug "Build model, algorithm_uri:"+algorithm_uri.to_s+", algorithm_parms: "+algorithm_params.inspect.to_s
- uri = OpenTox::RestClientWrapper.post(algorithm_uri,algorithm_params).to_s
- LOGGER.debug "Build model done: "+uri.to_s
- RestClientWrapper.raise_uri_error("Invalid build model result: '"+uri.to_s+"'", algorithm_uri, algorithm_params ) unless Utils.model_uri?(uri)
- return PredictionModel.find(uri)
- end
-
- def predict_dataset( dataset_uri )
-
- LOGGER.debug "Predict dataset: "+dataset_uri.to_s+" with model "+@uri.to_s
- uri = RestClientWrapper.post(@uri, {:accept => "text/uri-list", :dataset_uri=>dataset_uri})
- RestClientWrapper.raise_uri_error("Prediciton result no dataset uri: "+uri.to_s, @uri, {:dataset_uri=>dataset_uri} ) unless Utils.dataset_uri?(uri)
- uri
- end
-
- def classification?
- #HACK replace with request to ontology server
- if @title =~ /(?i)classification/
- return true
- elsif @title =~ /(?i)regression/
- return false
- elsif @uri =~/ntua/ and @title =~ /mlr/
- return false
- elsif @uri =~/tu-muenchen/ and @title =~ /regression|M5P|GaussP/
- return false
- elsif @uri =~/ambit2/ and @title =~ /pKa/ || @title =~ /Regression|Caco/
- return false
- elsif @uri =~/majority/
- return (@uri =~ /class/) != nil
+
+ # Predict all compounds of a dataset and collect the results in one prediction dataset.
+ #
+ # @param [String] dataset_uri URI of the dataset with the compounds to predict
+ # @return URI of the saved prediction dataset
+ def predict_dataset(dataset_uri)
+ @prediction_dataset = Dataset.create
+ @prediction_dataset.add_metadata({
+ # the model itself is source and creator (@lazar is never assigned in this class)
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] ))
+ })
+ @prediction_dataset.add_parameters({"dataset_uri" => dataset_uri})
+ Dataset.new(dataset_uri).load_compounds.each do |compound_uri|
+ predict(compound_uri,false)
+ end
+ @prediction_dataset.save
+ @prediction_dataset.uri
+ end
+
+ # Predict a single compound and store the result in @prediction_dataset.
+ #
+ # A new prediction dataset is created unless one exists already (e.g. from
+ # predict_dataset). Neighbors are determined first, then the method named in
+ # @prediction_algorithm sets @prediction and @confidence.
+ #
+ # @param [String] compound_uri URI of the compound to predict
+ # @param [optional, true, false] verbose Also store the compound features and the neighbors
+ # @return URI of the saved prediction dataset
+ def predict(compound_uri,verbose=false)
+
+ @compound = Compound.new compound_uri
+
+ unless @prediction_dataset
+ @prediction_dataset = Dataset.create
+ @prediction_dataset.add_metadata( {
+ # the model itself is source and creator (@lazar is never assigned in this class)
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] ))
+ } )
+ @prediction_dataset.add_parameters( {"compound_uri" => compound_uri} )
+ end
+
+ neighbors
+ # NOTE(review): eval executes the content of @prediction_algorithm as Ruby code;
+ # it is expected to hold the name of a prediction method of this class.
+ eval @prediction_algorithm
+
+ # false is a valid (classification) prediction, only nil means "no prediction"
+ unless @prediction.nil?
+
+ # File.join accepts only strings, counters have to be converted
+ feature_uri = File.join( @prediction_dataset.uri, "feature", @prediction_dataset.compounds.size.to_s)
+ @prediction_dataset.add @compound.uri, feature_uri, @prediction
+
+ feature_metadata = @prediction_dataset.metadata
+ feature_metadata[DC.title] = File.basename(@metadata[OT.dependentVariables])
+ feature_metadata[OT.prediction] = @prediction
+ feature_metadata[OT.confidence] = @confidence
+ @prediction_dataset.add_feature(feature_uri, feature_metadata)
+
+ if verbose
+ if @compound_features
+ @compound_features.each do |feature|
+ @prediction_dataset.add @compound.uri, feature, true
+ end
+ end
+ n = 0
+ @neighbors.sort{|a,b| a[:similarity] <=> b[:similarity]}.each do |neighbor|
+ neighbor_uri = File.join( @prediction_dataset.uri, "feature/neighbor", n.to_s )
+ @prediction_dataset.add @compound.uri, neighbor_uri, true
+ # the feature is registered under its URI, the neighbor hash provides the metadata
+ @prediction_dataset.add_feature(neighbor_uri, {
+ OT.compound => neighbor[:compound],
+ OT.similarity => neighbor[:similarity],
+ OT.activity => neighbor[:activity]
+ })
+ n+=1
+ end
+ end
+ end
+ @prediction_dataset.save
+ @prediction_dataset.uri
+ end
+
+ # Classification by a similarity weighted vote of the neighbors.
+ #
+ # Neighbors with activity "true" add OpenTox::Algorithm.gauss(similarity) to the
+ # vote, neighbors with activity "false" subtract it, all others are ignored.
+ # @prediction is true for a positive and false for a negative vote, nil otherwise;
+ # @confidence is the vote divided by the number of neighbors (unchanged without neighbors).
+ def weighted_majority_vote
+ conf = @neighbors.inject(0.0) do |vote,neighbor|
+ case neighbor[:activity].to_s
+ when 'true' then vote + OpenTox::Algorithm.gauss(neighbor[:similarity])
+ when 'false' then vote - OpenTox::Algorithm.gauss(neighbor[:similarity])
+ else vote
+ end
+ end
+ if conf > 0.0
+ @prediction = true
+ elsif conf < 0.0
+ @prediction = false
else
- raise "unknown model, uri:'"+@uri.to_s+"' title:'"+@title.to_s+"'"
+ @prediction = nil
end
+ @confidence = conf/@neighbors.size unless @neighbors.empty?
end
- end
-
- class Lazar < Generic
-
- attr_accessor :feature_dataset_uri, :effects, :activities, :p_values, :fingerprints, :features
-
- def initialize
- @source = "http://github.com/helma/opentox-model"
- @algorithm = File.join(CONFIG[:services]["opentox-algorithm"],"lazar")
- #@independent_variables = File.join(CONFIG[:services]["opentox-algorithm"],"fminer#BBRC_representative")
- @features = []
- @effects = {}
- @activities = {}
- @p_values = {}
- @fingerprints = {}
+
+ # Regression prediction with a support vector machine trained on the neighbors.
+ #
+ # Neighbor activities are log10 transformed, a nu-svr model is fitted in R
+ # (package kernlab) on the gram matrix of the neighbors and @prediction is set
+ # to 10 to the power of the predicted value. @confidence is the sum of the
+ # neighbor similarities divided by the number of neighbors.
+ #
+ # @raise [RuntimeError] if a neighbor activity is 0 or if there are no neighbors
+ def local_svm_regression
+ sims = @neighbors.collect{ |n| n[:similarity] } # similarity values between query and neighbors
+ conf = sims.inject{|sum,x| sum + x }
+ acts = @neighbors.collect do |n|
+ act = n[:activity]
+ # TODO: check this in model creation
+ raise "0 values not allowed in training dataset. log10 is calculated internally." if act.to_f == 0
+ Math.log10(act.to_f)
+ end # activities of neighbors for supervised learning
+
+ neighbor_matches = @neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches
+ gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
+ if neighbor_matches.size == 0
+ raise "No neighbors found"
+ else
+ # gram matrix
+ # p-values are taken from this model (@lazar is never assigned in this class)
+ (0..(neighbor_matches.length-1)).each do |i|
+ gram_matrix[i] = []
+ # lower triangle
+ (0..(i-1)).each do |j|
+ sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @p_values)
+ gram_matrix[i] << OpenTox::Algorithm.gauss(sim)
+ end
+ # diagonal element
+ gram_matrix[i][i] = 1.0
+ # upper triangle
+ ((i+1)..(neighbor_matches.length-1)).each do |j|
+ sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @p_values) # double calculation?
+ gram_matrix[i] << OpenTox::Algorithm.gauss(sim)
+ end
+ end
+
+ # NOTE(review): @r.quit below is skipped if one of the R calls raises
+ @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
+ @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
+ LOGGER.debug "Setting R data ..."
+ # set data
+ @r.gram_matrix = gram_matrix.flatten
+ @r.n = neighbor_matches.size
+ @r.y = acts
+ @r.sims = sims
+
+ LOGGER.debug "Preparing R data ..."
+ # prepare data
+ @r.eval "y<-as.vector(y)"
+ @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
+ @r.eval "sims<-as.vector(sims)"
+
+ # model + support vectors
+ LOGGER.debug "Creating SVM model ..."
+ @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)"
+ @r.eval "sv<-as.vector(SVindex(model))"
+ @r.eval "sims<-sims[sv]"
+ @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
+ LOGGER.debug "Predicting ..."
+ @r.eval "p<-predict(model,sims)[1,1]"
+ # back-transformation of the log10 activities
+ @prediction = 10**(@r.p.to_f)
+ LOGGER.debug "Prediction is: '" + @prediction.to_s + "'."
+ @r.quit # free R
+ end
+ @confidence = conf/@neighbors.size if @neighbors.size > 0
+
end
- def save
- @features.uniq!
- resource = RestClient::Resource.new(CONFIG[:services]["opentox-model"])
- resource.post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s
+ # Find the training compounds that are similar to the query compound.
+ #
+ # The query features are calculated with the method named in
+ # @feature_calculation_algorithm, the similarity to each training compound with
+ # the method named in @similarity_algorithm. @neighbors receives one hash
+ # (:compound, :similarity, :features, :activity) per activity of every training
+ # compound with a similarity above @min_sim.
+ def neighbors
+
+ # NOTE(review): eval executes the algorithm names as Ruby code; they are
+ # expected to hold names of methods of this class.
+ @compound_features = eval(@feature_calculation_algorithm) if @feature_calculation_algorithm
+
+ # has to be an Array: entries are appended with << and sorted in predict
+ @neighbors = []
+ @activities.each do |training_compound,activities|
+ @training_compound = training_compound
+ sim = eval(@similarity_algorithm)
+ if sim > @min_sim
+ activities.each do |act|
+ @neighbors << {
+ :compound => @training_compound,
+ :similarity => sim,
+ :features => @fingerprints[@training_compound],
+ :activity => act
+ }
+ end
+ end
+ end
+
end
- def self.find_all
- RestClientWrapper.get(CONFIG[:services]["opentox-model"]).chomp.split("\n")
+ # Similarity of the query compound features and the fingerprint of the current
+ # training compound, calculated by OpenTox::Algorithm.tanimoto.
+ def tanimoto
+ training_fingerprint = @fingerprints[@training_compound]
+ OpenTox::Algorithm.tanimoto(@compound_features,training_fingerprint)
end
- def self.predict(compound_uri,model_uri)
- #RestClientWrapper.post(model_uri,{:compound_uri => compound_uri, :accept => 'application/x-yaml'})
- `curl -X POST -d 'compound_uri=#{compound_uri}' -H 'Accept:application/x-yaml' #{model_uri}`
+ # Like tanimoto, but @p_values is passed to OpenTox::Algorithm.tanimoto as third argument.
+ def weighted_tanimoto
+ training_fingerprint = @fingerprints[@training_compound]
+ OpenTox::Algorithm.tanimoto(@compound_features,training_fingerprint,@p_values)
end
- end
-
- class PropertyLazar < Generic
-
- attr_accessor :feature_dataset_uri, :properties, :features, :activities#, :effects, :p_values
-
- def initialize
- @source = "http://github.com/helma/opentox-model"
- @algorithm = File.join(CONFIG[:services]["opentox-algorithm"],"property_lazar")
- #@independent_variables = File.join(CONFIG[:services]["opentox-algorithm"],"fminer#BBRC_representative")
- @features = []
- #@effects = {}
- @activities = {}
- #@p_values = {}
- @properties = {}
+
+ # Similarity of the query compound features and the fingerprint of the current training compound.
+ # NOTE(review): despite its name this delegates to OpenTox::Algorithm.tanimoto -
+ # confirm whether a Euclidean measure was intended.
+ def euclid
+ training_fingerprint = @fingerprints[@training_compound]
+ OpenTox::Algorithm.tanimoto(@compound_features,training_fingerprint)
+ end
+
+ # Like euclid, but @p_values is passed as third argument.
+ # NOTE(review): despite its name this delegates to OpenTox::Algorithm.tanimoto -
+ # confirm whether a Euclidean measure was intended.
+ def weighted_euclid
+ training_fingerprint = @fingerprints[@training_compound]
+ OpenTox::Algorithm.tanimoto(@compound_features,training_fingerprint,@p_values)
+ end
+
+ # Match the query compound against the features of the model.
+ def substructure_match
+ model_features = @features
+ @compound.match(model_features)
+ end
+
+ # Ask the feature dataset of the model for the features of the query compound.
+ def database_search
+ #TODO add features method to dataset
+ feature_dataset = Dataset.new(@metadata[OT.featureDataset])
+ feature_dataset.features(@compound.uri)
+ end
+
+ # Collect the training activities of a compound in a new dataset.
+ #
+ # @param [String] compound_uri Compound URI
+ # @return Dataset with the activities and the training dataset as source, nil if the compound has no training activities
+ def database_activity(compound_uri)
+ prediction = OpenTox::Dataset.new
+ # find database activities
+ known_activities = @activities[compound_uri]
+ return nil unless known_activities
+ known_activities.each do |act|
+ prediction.add compound_uri, @metadata[OT.dependentVariables], act
+ end
+ prediction.add_metadata(OT.hasSource => @metadata[OT.trainingDataset])
+ prediction
end
+ # Post the YAML representation of this model to @uri.
def save
- @features.uniq!
- resource = RestClient::Resource.new(CONFIG[:services]["opentox-model"])
- resource.post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s
+ yaml_header = {:content_type => "application/x-yaml"}
+ RestClientWrapper.post(@uri,yaml_header,self.to_yaml)
end
- def self.find_all
- RestClientWrapper.get(CONFIG[:services]["opentox-model"]).chomp.split("\n")
+ # Lines of the response of the configured "opentox-model" service.
+ def self.all
+ listing = RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s
+ listing.split("\n")
end
- def self.predict(compound_uri,model_uri)
- #RestClientWrapper.post(model_uri,{:compound_uri => compound_uri, :accept => 'application/x-yaml'})
- `curl -X POST -d 'compound_uri=#{compound_uri}' -H 'Accept:application/x-yaml' #{model_uri}`
+ # Delete the model at @uri; nothing happens if @uri is the model service itself.
+ def delete
+ return if @uri == CONFIG[:services]["opentox-model"]
+ RestClientWrapper.delete @uri
end
+
end
end
end