summaryrefslogtreecommitdiff
path: root/lib/model.rb
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2010-11-19 16:53:21 +0100
committerChristoph Helma <helma@in-silico.ch>2010-11-19 16:53:21 +0100
commitf8552611c2dbe25d76474f51e4e895bf9c2b5c5e (patch)
treeda145cd1d69adc4cdb8d299f0cea2e0810b88eaf /lib/model.rb
parent91c95f8dc8f60a8f0029b970ef881eecee28401b (diff)
lazar predictions for toxcreate working
Diffstat (limited to 'lib/model.rb')
-rw-r--r--lib/model.rb466
1 files changed, 181 insertions, 285 deletions
diff --git a/lib/model.rb b/lib/model.rb
index 63013cb..c6a2cf4 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -4,6 +4,9 @@ module OpenTox
include OpenTox
+ # Run a model with parameters
+ # @param [Hash] params Parameters for OpenTox model
+ # @return [text/uri-list] Task or resource URI
def run(params)
if CONFIG[:yaml_hosts].include?(URI.parse(@uri).host)
accept = 'application/x-yaml'
@@ -11,47 +14,25 @@ module OpenTox
accept = 'application/rdf+xml'
end
begin
- params[:acccept] = accept
- #TODO fix: REstClientWrapper does not accept accept header
- #RestClientWrapper.post(@uri,params)#,{:accept => accept})
- `curl -X POST -H "Accept:#{accept}" #{params.collect{|k,v| "-d #{k}=#{v}"}.join(" ")} #{@uri}`.to_s.chomp
+ RestClientWrapper.post(@uri,{:accept => accept},params).to_s
rescue => e
LOGGER.error "Failed to run #{@uri} with #{params.inspect} (#{e.inspect})"
raise "Failed to run #{@uri} with #{params.inspect}"
end
end
-
-=begin
- def classification?
- #TODO replace with request to ontology server
- if @metadata[DC.title] =~ /(?i)classification/
- return true
- elsif @metadata[DC.title] =~ /(?i)regression/
- return false
- elsif @uri =~/ntua/ and @metadata[DC.title] =~ /mlr/
- return false
- elsif @uri =~/tu-muenchen/ and @metadata[DC.title] =~ /regression|M5P|GaussP/
- return false
- elsif @uri =~/ambit2/ and @metadata[DC.title] =~ /pKa/ || @metadata[DC.title] =~ /Regression|Caco/
- return false
- elsif @uri =~/majority/
- return (@uri =~ /class/) != nil
- else
- raise "unknown model, uri:'"+@uri+"' title:'"+@metadata[DC.title]+"'"
- end
- end
-=end
+ # Generic OpenTox model class for all API compliant services
class Generic
include Model
end
+ # Lazy Structure Activity Relationship class
class Lazar
include Model
+ include Algorithm
- #attr_accessor :prediction_type, :feature_type, :features, :effects, :activities, :p_values, :fingerprints, :parameters
- attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :parameters, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :min_sim
def initialize(uri=nil)
@@ -61,7 +42,6 @@ module OpenTox
super CONFIG[:services]["opentox-model"]
end
- # TODO: fix metadata, add parameters
@metadata[OT.algorithm] = File.join(CONFIG[:services]["opentox-algorithm"],"lazar")
@features = []
@@ -70,284 +50,192 @@ module OpenTox
@p_values = {}
@fingerprints = {}
- @feature_calculation_algorithm = "substructure_match"
- @similarity_algorithm = "weighted_tanimoto"
- @prediction_algorithm = "weighted_majority_vote"
+ @feature_calculation_algorithm = "Substructure.match"
+ @similarity_algorithm = "Similarity.tanimoto"
+ @prediction_algorithm = "Neighbors.weighted_majority_vote"
@min_sim = 0.3
end
- def self.find(uri)
- YAML.load RestClientWrapper.get(uri,:content_type => 'application/x-yaml')
+ # Get URIs of all lazar models
+ # @return [Array] List of lazar model URIs
+ def self.all
+ RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s.split("\n")
end
- def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil)
- training_activities = OpenTox::Dataset.find(dataset_uri)
- training_features = OpenTox::Dataset.find(feature_dataset_uri)
- unless prediction_feature # try to read prediction_feature from dataset
- raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1
- prediction_feature = training_activities.features.keys.first
- params[:prediction_feature] = prediction_feature
- end
- lazar = Lazar.new
- training_features = OpenTox::Dataset.new(feature_dataset_uri)
- case training_features.feature_type
- when "classification"
- lazar.similarity_algorithm = "weighted_tanimoto"
- when "regression"
- lazar.similarity_algorithm = "weighted_euclid"
- end
+ # Find a lazar model
+ # @param [String] uri Model URI
+ # @return [OpenTox::Model::Lazar] lazar model
+ def self.find(uri)
+ YAML.load RestClientWrapper.get(uri,:accept => 'application/x-yaml')
end
- def self.create(dataset_uri,prediction_feature=nil,feature_generation_uri=File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"),params=nil)
-
- training_activities = OpenTox::Dataset.find(dataset_uri)
-
- unless prediction_feature # try to read prediction_feature from dataset
- raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1
- prediction_feature = training_activities.features.keys.first
- params[:prediction_feature] = prediction_feature
- end
-
- lazar = Lazar.new
- params[:feature_generation_uri] = feature_generation_uri
- feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params).to_s
- training_features = OpenTox::Dataset.find(feature_dataset_uri)
- raise "Dataset #{feature_dataset_uri} not found or empty." if training_features.nil?
-
- # sorted features for index lookups
- lazar.features = training_features.features.sort if training_features.feature_type == "regression"
-
- training_features.data_entries.each do |compound,entry|
- lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound]
- entry.keys.each do |feature|
- case training_features.feature_type
- when "fminer"
- # fingerprints are sets
- smarts = training_features.features[feature][OT.smarts]
- lazar.fingerprints[compound] << smarts
- unless lazar.features.include? smarts
- lazar.features << smarts
- lazar.p_values[smarts] = training_features.features[feature][OT.p_value]
- lazar.effects[smarts] = training_features.features[feature][OT.effect]
- end
- when "classification"
- # fingerprints are sets
- if entry[feature].flatten.size == 1
- lazar.fingerprints[compound] << feature if entry[feature].flatten.first.match(TRUE_REGEXP)
- lazar.features << feature unless lazar.features.include? feature
- else
- LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}"
- end
- when "regression"
- # fingerprints are arrays
- if entry[feature].flatten.size == 1
- lazar.fingerprints[compound][lazar.features.index(feature)] = entry[feature].flatten.first
- else
- LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}"
- end
- end
- end
-
- lazar.activities[compound] = [] unless lazar.activities[compound]
- training_activities.data_entries[compound][params[:prediction_feature]].each do |value|
- case value.to_s
- when "true"
- lazar.activities[compound] << true
- when "false"
- lazar.activities[compound] << false
- else
- lazar.activities[compound] << value.to_f
- lazar.prediction_type = "regression"
- end
- end
- end
-
- if feature_generation_uri.match(/fminer/)
- lazar.feature_calculation_algorithm = "substructure_match"
- else
- halt 404, "External feature generation services not yet supported"
- end
-
- lazar.metadata[OT.dependentVariables] = params[:prediction_feature]
- lazar.metadata[OT.trainingDataset] = dataset_uri
- lazar.metadata[OT.featureDataset] = feature_dataset_uri
+ # Create a new lazar model
+ # @param [optional,Hash] params Parameters for the lazar algorithm (OpenTox::Algorithm::Lazar)
+ # @return [OpenTox::Model::Lazar] lazar model
+ def self.create(params)
+ lazar_algorithm = OpenTox::Algorithm::Generic.new File.join( CONFIG[:services]["opentox-algorithm"],"lazar")
+ model_uri = lazar_algorithm.run(params)
+ OpenTox::Model::Lazar.find(model_uri)
+ end
- lazar.parameters = {
- "dataset_uri" => dataset_uri,
- "prediction_feature" => prediction_feature,
- "feature_generation_uri" => feature_generation_uri
- }
-
- model_uri = lazar.save
- LOGGER.info model_uri + " created #{Time.now}"
- model_uri
+=begin
+ # Create a new lazar model and return task
+ # @param [optional,Hash] params Parameters for the lazar algorithm (OpenTox::Algorithm::Lazar)
+ # @return [OpenTox::Task] Task for lazar model creation
+ def self.create_task(params)
+ task_uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"],"lazar"), {}, params, false)
+ Task.find(task_uri)
+ #model_uri = lazar_algorithm.run(params)
+ #OpenTox::Model::Lazar.new(model_uri)
+ end
+=end
+ def parameter(param)
+ @metadata[OT.parameters].collect{|p| p[OT.paramValue] if p[DC.title] == param}.compact.first
end
def predict_dataset(dataset_uri)
@prediction_dataset = Dataset.create
@prediction_dataset.add_metadata({
- OT.hasSource => @lazar.uri,
- DC.creator => @lazar.uri,
- DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] ))
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )),
+ OT.parameters => [{DC.title => "dataset_uri", OT.paramValue => dataset_uri}]
})
- @prediction_dataset.add_parameters({"dataset_uri" => dataset_uri})
- Dataset.new(dataset_uri).load_compounds.each do |compound_uri|
+ d = Dataset.new(dataset_uri)
+ d.load_compounds
+ d.compounds.each do |compound_uri|
predict(compound_uri,false)
end
@prediction_dataset.save
- @prediction_dataset.uri
+ @prediction_dataset
end
+ # Predict a compound
+ # @param [String] compound_uri Compound URI
+ # @param [optional,Boolean] verbose Verbose prediction (output includes neighbors and features)
+ # @return [OpenTox::Dataset] Dataset with prediction
def predict(compound_uri,verbose=false)
@compound = Compound.new compound_uri
+ features = {}
unless @prediction_dataset
+ #@prediction_dataset = cached_prediction
+ #return @prediction_dataset if cached_prediction
@prediction_dataset = Dataset.create
@prediction_dataset.add_metadata( {
- OT.hasSource => @lazar.uri,
- DC.creator => @lazar.uri,
- DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] ))
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ # TODO: fix dependentVariable
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )),
+ OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
} )
- @prediction_dataset.add_parameters( {"compound_uri" => compound_uri} )
end
- neighbors
- eval @prediction_algorithm
-
- if @prediction
+ return @prediction_dataset if database_activity
- feature_uri = File.join( @prediction_dataset.uri, "feature", @prediction_dataset.compounds.size)
- @prediction_dataset.add @compound.uri, feature_uri, @prediction
+ neighbors
+ prediction = eval("#{@prediction_algorithm}(@neighbors,{:similarity_algorithm => @similarity_algorithm, :p_values => @p_values})")
+
+ prediction_feature_uri = File.join( @prediction_dataset.uri, "feature", "prediction", File.basename(@metadata[OT.dependentVariables]),@prediction_dataset.compounds.size.to_s)
+ # TODO: fix dependentVariable
+ @prediction_dataset.metadata[OT.dependentVariables] = prediction_feature_uri
+
+ if @neighbors.size == 0
+ @prediction_dataset.add_feature(prediction_feature_uri, {
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )),
+ OT.error => "No similar compounds in training dataset.",
+ OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
+ })
+ @prediction_dataset.add @compound.uri, prediction_feature_uri, prediction[:prediction]
- feature_metadata = @prediction_dataset.metadata
- feature_metadata[DC.title] = File.basename(@metadata[OT.dependentVariables])
- feature_metadata[OT.prediction] = @prediction
- feature_metadata[OT.confidence] = @confidence
- @prediction_dataset.add_feature(feature_uri, feature_metadata)
+ else
+ @prediction_dataset.add_feature(prediction_feature_uri, {
+ OT.hasSource => @uri,
+ DC.creator => @uri,
+ DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )),
+ OT.prediction => prediction[:prediction],
+ OT.confidence => prediction[:confidence],
+ OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
+ })
+ @prediction_dataset.add @compound.uri, prediction_feature_uri, prediction[:prediction]
if verbose
- if @compound_features
+ if @feature_calculation_algorithm == "Substructure.match"
+ f = 0
+ @compound_features.each do |feature|
+ feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s)
+ features[feature] = feature_uri
+ @prediction_dataset.add_feature(feature_uri, {
+ OT.smarts => feature,
+ OT.p_value => @p_values[feature],
+ OT.effect => @effects[feature]
+ })
+ @prediction_dataset.add @compound.uri, feature_uri, true
+ f+=1
+ end
+ else
@compound_features.each do |feature|
+ features[feature] = feature
@prediction_dataset.add @compound.uri, feature, true
end
end
n = 0
- @neighbors.sort{|a,b| a[:similarity] <=> b[:similarity]}.each do |neighbor|
- neighbor_uri = File.join( @prediction_dataset.uri, "feature/neighbor", n )
- @prediction_dataset.add @compound.uri, neighbor_uri, true
- @prediction_dataset.add_feature(neighbor, {
+ @neighbors.each do |neighbor|
+ neighbor_uri = File.join( @prediction_dataset.uri, "feature", "neighbor", n.to_s )
+ @prediction_dataset.add_feature(neighbor_uri, {
OT.compound => neighbor[:compound],
OT.similarity => neighbor[:similarity],
OT.activity => neighbor[:activity]
})
+ @prediction_dataset.add @compound.uri, neighbor_uri, true
+ f = 0 unless f
+ neighbor[:features].each do |feature|
+ if @feature_calculation_algorithm == "Substructure.match"
+ feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) unless feature_uri = features[feature]
+ else
+ feature_uri = feature
+ end
+ @prediction_dataset.add neighbor[:compound], feature_uri, true
+ unless features.has_key? feature
+ features[feature] = feature_uri
+ @prediction_dataset.add_feature(feature_uri, {
+ OT.smarts => feature,
+ OT.p_value => @p_values[feature],
+ OT.effect => @effects[feature]
+ })
+ f+=1
+ end
+ end
n+=1
end
+ # what happens with dataset predictions?
end
end
- @prediction_dataset.save
- @prediction_dataset.uri
- end
-
- def weighted_majority_vote
- conf = 0.0
- @neighbors.each do |neighbor|
- case neighbor[:activity].to_s
- when 'true'
- conf += OpenTox::Algorithm.gauss(neighbor[:similarity])
- when 'false'
- conf -= OpenTox::Algorithm.gauss(neighbor[:similarity])
- end
- end
- if conf > 0.0
- @prediction = true
- elsif conf < 0.0
- @prediction = false
- else
- @prediction = nil
- end
- @confidence = conf/@neighbors.size if @neighbors.size > 0
- end
-
- def local_svm_regression
- sims = @neighbors.collect{ |n| n[:similarity] } # similarity values between query and neighbors
- conf = sims.inject{|sum,x| sum + x }
- acts = @neighbors.collect do |n|
- act = n[:activity]
- # TODO: check this in model creation
- raise "0 values not allowed in training dataset. log10 is calculated internally." if act.to_f == 0
- Math.log10(act.to_f)
- end # activities of neighbors for supervised learning
-
- neighbor_matches = @neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches
- gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
- if neighbor_matches.size == 0
- raise "No neighbors found"
- else
- # gram matrix
- (0..(neighbor_matches.length-1)).each do |i|
- gram_matrix[i] = []
- # lower triangle
- (0..(i-1)).each do |j|
- sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values)
- gram_matrix[i] << OpenTox::Algorithm.gauss(sim)
- end
- # diagonal element
- gram_matrix[i][i] = 1.0
- # upper triangle
- ((i+1)..(neighbor_matches.length-1)).each do |j|
- sim = OpenTox::Algorithm.weighted_tanimoto(neighbor_matches[i], neighbor_matches[j], @lazar.p_values) # double calculation?
- gram_matrix[i] << OpenTox::Algorithm.gauss(sim)
- end
- end
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
- LOGGER.debug "Setting R data ..."
- # set data
- @r.gram_matrix = gram_matrix.flatten
- @r.n = neighbor_matches.size
- @r.y = acts
- @r.sims = sims
-
- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-as.vector(y)"
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
- @r.eval "sims<-as.vector(sims)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)"
- @r.eval "sv<-as.vector(SVindex(model))"
- @r.eval "sims<-sims[sv]"
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
- LOGGER.debug "Predicting ..."
- @r.eval "p<-predict(model,sims)[1,1]"
- @prediction = 10**(@r.p.to_f)
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- @r.quit # free R
- end
- @confidence = conf/@neighbors.size if @neighbors.size > 0
-
+ @prediction_dataset.save
+ @prediction_dataset
end
+ # Find neighbors and store them as object variable
def neighbors
- @compound_features = eval(@feature_calculation_algorithm) if @feature_calculation_algorithm
+ @compound_features = eval("#{@feature_calculation_algorithm}(@compound,@features)") if @feature_calculation_algorithm
- @neighbors = {}
- @activities.each do |training_compound,activities|
- @training_compound = training_compound
- sim = eval(@similarity_algorithm)
+ @neighbors = []
+ @fingerprints.each do |training_compound,training_features|
+ #@activities.each do |training_compound,activities|
+ sim = eval("#{@similarity_algorithm}(@compound_features,training_features,@p_values)")
if sim > @min_sim
- activities.each do |act|
+ @activities[training_compound].each do |act|
@neighbors << {
- :compound => @training_compound,
+ :compound => training_compound,
:similarity => sim,
- :features => @fingerprints[@training_compound],
+ :features => training_features,
:activity => act
}
end
@@ -356,55 +244,63 @@ module OpenTox
end
- def tanimoto
- OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound])
- end
-
- def weighted_tanimoto
- OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values)
- end
-
- def euclid
- OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound])
- end
-
- def weighted_euclid
- OpenTox::Algorithm.tanimoto(@compound_features,@fingerprints[@training_compound],@p_values)
- end
-
- def substructure_match
- @compound.match(@features)
- end
-
- def database_search
- #TODO add features method to dataset
- Dataset.new(@metadata[OT.featureDataset]).features(@compound.uri)
+=begin
+ def cached_prediction
+ dataset_uri = PredictionCache.find(:model_uri => @uri, :compound_uri => @compound.uri).dataset_uri)
+ return false unless dataset_uri
+ @prediction_dataset = Dataset.find(dataset_uri)
+ return false unless @prediction_dataset
+ LOGGER.debug "Serving cached prediction"
+ true
end
+=end
- def database_activity(compound_uri)
- prediction = OpenTox::Dataset.new
- # find database activities
- if @activities[compound_uri]
- @activities[compound_uri].each { |act| prediction.add compound_uri, @metadata[OT.dependentVariables], act }
- prediction.add_metadata(OT.hasSource => @metadata[OT.trainingDataset])
- prediction
+ # Find database activities and store them in @prediction_dataset
+ # @return [Boolean] true if compound has database activities, false if not
+ def database_activity
+ if @activities[@compound.uri]
+ @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], act }
+ @prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset])
+ @prediction_dataset.save
+ true
else
- nil
+ false
end
end
+ # Save model at model service
def save
- RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml)
- end
-
- def self.all
- RestClientWrapper.get(CONFIG[:services]["opentox-model"]).to_s.split("\n")
+ self.uri = RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml)
end
+ # Delete model at model service
def delete
RestClientWrapper.delete @uri unless @uri == CONFIG[:services]["opentox-model"]
end
+=begin
+=end
+
+=begin
+ def self.create_from_dataset(dataset_uri,feature_dataset_uri,prediction_feature=nil)
+ training_activities = OpenTox::Dataset.find(dataset_uri)
+ training_features = OpenTox::Dataset.find(feature_dataset_uri)
+ unless prediction_feature # try to read prediction_feature from dataset
+ raise "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1
+ prediction_feature = training_activities.features.keys.first
+ params[:prediction_feature] = prediction_feature
+ end
+ lazar = Lazar.new
+ training_features = OpenTox::Dataset.new(feature_dataset_uri)
+ case training_features.feature_type
+ when "classification"
+ lazar.similarity_algorithm = "weighted_tanimoto"
+ when "regression"
+ lazar.similarity_algorithm = "weighted_euclid"
+ end
+ end
+=end
+
end
end
end