diff options
Diffstat (limited to 'lib/lazar.rb')
-rw-r--r-- | lib/lazar.rb | 201 |
1 files changed, 85 insertions, 116 deletions
diff --git a/lib/lazar.rb b/lib/lazar.rb index 399f5c1..d9195ad 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -1,10 +1,3 @@ -=begin -* Name: lazar.rb -* Description: Lazar model representation -* Author: Andreas Maunz <andreas@maunz.de>, Christoph Helma -* Date: 10/2012 -=end - module OpenTox module Model @@ -16,26 +9,18 @@ module OpenTox store_in collection: "models" field :title, type: String - field :description, type: String - #field :parameters, type: Array, default: [] + field :endpoint, type: String field :creator, type: String, default: __FILE__ # datasets field :training_dataset_id, type: BSON::ObjectId field :feature_dataset_id, type: BSON::ObjectId # algorithms - #field :feature_generation, type: String - #field :feature_calculation_algorithm, type: String + field :feature_calculation_algorithm, type: String field :prediction_algorithm, type: String field :similarity_algorithm, type: String - # prediction features - field :prediction_feature_id, type: BSON::ObjectId - field :predicted_value_id, type: BSON::ObjectId - field :predicted_variables, type: Array - # parameters - field :nr_hits, type: Boolean field :min_sim, type: Float - #field :propositionalized, type:Boolean - field :min_train_performance, type: Float + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId attr_accessor :prediction_dataset attr_accessor :training_dataset @@ -43,84 +28,31 @@ module OpenTox attr_accessor :query_fingerprint attr_accessor :neighbors - # Check parameters for plausibility - # Prepare lazar object (includes graph mining) - # @param[Array] lazar parameters as strings - # @param[Hash] REST parameters, as input by user - def self.create training_dataset, feature_dataset, prediction_feature=nil, nr_hits=false, params={} - - lazar = OpenTox::Model::Lazar.new + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @param [OpenTox::Dataset] feature_dataset + # @return [OpenTox::Model::Lazar] Regression or classification model + def self.create training_dataset, feature_dataset bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty? - lazar.feature_dataset_id = feature_dataset.id - @training_dataset = training_dataset - bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless @training_dataset.compounds == feature_dataset.compounds - lazar.training_dataset_id = @training_dataset.id - - if prediction_feature - resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{@training_dataset.id}'" unless @training_dataset.features.include?( params[:prediction_feature] ) - else # try to read prediction_feature from dataset - resource_not_found_error "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1 - prediction_feature = @training_dataset.features.first - end + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds + prediction_feature = training_dataset.features.first + prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new + lazar.feature_dataset_id = feature_dataset.id + lazar.training_dataset_id = training_dataset.id lazar.prediction_feature_id = prediction_feature.id lazar.title = prediction_feature.title - if params and params[:prediction_algorithm] - bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" unless OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm]) - lazar.prediction_algorithm = params[:prediction_algorithm] - end - - unless lazar.prediction_algorithm # set defaults - # TODO consider params - if prediction_feature.nominal - lazar.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" - lazar.min_sim = 0.3 unless lazar.min_sim - elsif prediction_feature.numeric - lazar.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" - lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" - # cosine similartiy is default - lazar.min_sim = 0.7 unless lazar.min_sim - end - end - #lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true - - lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric? - # TODO: get info from training_dataset - lazar.nr_hits = nr_hits - #lazar.feature_generation = feature_dataset.training_algorithm - #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]} - - bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric? - lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric? - lazar.min_train_performance = 0.1 unless lazar.min_train_performance - lazar.save lazar end def predict object - # tailored for performance - # all consistency checks should be done during model creation - time = Time.now - # prepare prediction dataset - prediction_dataset = LazarPrediction.new - prediction_feature = OpenTox::Feature.find prediction_feature_id - prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}", - prediction_dataset.creator = __FILE__, - - confidence_feature = OpenTox::Feature.find_or_create_by({ - "title" => "Prediction confidence", - "numeric" => true - }) - - prediction_dataset.features = [ confidence_feature, prediction_feature ] - @training_dataset = OpenTox::Dataset.find(training_dataset_id) @feature_dataset = OpenTox::Dataset.find(feature_dataset_id) @@ -139,52 +71,44 @@ module OpenTox $logger.debug "Setup: #{Time.now-time}" time = Time.now - @query_fingerprint = Algorithm.run(feature_dataset.feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) - - $logger.debug "Fingerprint calculation: #{Time.now-time}" - time = Time.now + @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) - # AM: transform to cosine space - min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ + $logger.debug "Query fingerprint calculation: #{Time.now-time}" + predictions = [] + prediction_feature = OpenTox::Feature.find prediction_feature_id + tt = 0 + pt = 0 compounds.each_with_index do |compound,c| + t = Time.new $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" database_activities = @training_dataset.values(compound,prediction_feature) if database_activities and !database_activities.empty? - database_activities.each do |database_activity| - $logger.debug "do not predict compound, it occurs in dataset with activity #{database_activity}" - prediction_dataset.compound_ids << compound.id - prediction_dataset[c,0] = database_activity - prediction_dataset[c,1] = nil - end + database_activities = database_activities.first if database_activities.size == 1 + $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}" + predictions << {:compound => compound, :value => database_activities, :confidence => "measured"} next else - t = Time.new if prediction_algorithm =~ /Regression/ mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) mtf.transform training_fingerprints = mtf.n_prop - training_activities = mtf.activities - p training_activities query_fingerprint = mtf.q_prop neighbors = [[nil,nil,nil,query_fingerprint]] else training_fingerprints = @feature_dataset.data_entries - # TODO fix for multi feature datasets - training_activities = @training_dataset.data_entries[i].first query_fingerprint = @query_fingerprint[c] neighbors = [] end - $logger.debug "Transform: #{Time.now-t}" + tt += Time.now-t t = Time.new # find neighbors training_fingerprints.each_with_index do |fingerprint, i| - sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint) if sim > self.min_sim if prediction_algorithm =~ /Regression/ @@ -195,40 +119,85 @@ module OpenTox end end + if neighbors.empty? + predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"} + #$logger.warn "No neighbors found for compound #{compound}." + next + end + if prediction_algorithm =~ /Regression/ prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance) else prediction = Algorithm.run(prediction_algorithm, neighbors) end + prediction[:compound] = compound + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort with ascending similarities - $logger.debug "Prediction time: #{Time.now-time}" - time = Time.now - p prediction # AM: transform to original space (TODO) confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/ - $logger.debug "predicted value: #{prediction[0]}, confidence: #{prediction[1]}" + $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}" + predictions << prediction + pt += Time.now-t end - prediction_dataset.compound_ids << compound - prediction_dataset[c,0] = prediction[0] - prediction_dataset[c,1] = prediction[1] end - prediction_dataset + $logger.debug "Transform time: #{tt}" + $logger.debug "Prediction time: #{pt}" + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__ + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end end def training_activities - # TODO select predicted variable - #@training_activities = @training_dataset.data_entries.collect{|entry| - #act = entry[prediction_feature_pos] if entry - #@prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act - #} - @training_dataset.data_entries.flatten + i = @training_dataset.feature_ids.index prediction_feature_id + @training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarRegression < Lazar + field :min_train_performance, type: Float, default: 0.1 + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Regression.local_svm_regression" + self.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" + self.min_sim = 0.7 + + # AM: transform to cosine space + min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ end + end + class LazarClassification < Lazar + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + self.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" + self.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match" + self.min_sim = 0.3 + end end end |