From 23d0106f985206c898c8b30f1859b619a3970398 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 19 Jul 2015 21:25:36 +0200 Subject: lazar predictions working in principle --- lib/bbrc.rb | 54 +++++---- lib/descriptor.rb | 13 +- lib/fminer.rb | 25 ++-- lib/last.rb | 127 ++++++++++++++++++++ lib/lazar.rb | 306 +++++++++++++++++++++++++++-------------------- lib/opentox-algorithm.rb | 11 ++ lib/transform.rb | 6 +- 7 files changed, 366 insertions(+), 176 deletions(-) create mode 100644 lib/last.rb diff --git a/lib/bbrc.rb b/lib/bbrc.rb index f7d29f9..40de186 100644 --- a/lib/bbrc.rb +++ b/lib/bbrc.rb @@ -1,12 +1,7 @@ -ENV['FMINER_SMARTS'] = 'true' -ENV['FMINER_NO_AROMATIC'] = 'true' -ENV['FMINER_PVALUES'] = 'true' -ENV['FMINER_SILENT'] = 'true' -ENV['FMINER_NR_HITS'] = 'true' - module OpenTox module Algorithm class Fminer + # # Run bbrc algorithm on dataset # # @param [String] dataset_uri URI of the training dataset @@ -24,6 +19,7 @@ module OpenTox @fminer=OpenTox::Algorithm::Fminer.new @fminer.check_params(params,5) + # TODO introduce task again #task = OpenTox::Task.run("Mining BBRC features", __FILE__ ) do |task| time = Time.now @@ -50,7 +46,7 @@ module OpenTox feature_dataset.creator = __FILE__ feature_dataset.parameters = [ { "title" => "dataset_id", "paramValue" => params[:dataset].id }, - { "title" => "prediction_feature", "paramValue" => params[:prediction_feature].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }, { "title" => "min_frequency", "paramValue" => @fminer.minfreq }, { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") }, { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") } @@ -67,8 +63,9 @@ module OpenTox #task.progress 10 step_width = 80 / @bbrc.GetNoRootNodes().to_f + #features_smarts = Set.new features = [] - data_entries = [[]] + data_entries = Array.new(params[:dataset].compounds.size) {[]} puts "Setup: #{Time.now-time}" time = Time.now @@ -76,7 +73,6 @@ module OpenTox # run @bbrc - # prepare to receive results as hash { c => [ [f,v], ... ] } fminer_results = {} (0 .. @bbrc.GetNoRootNodes()-1).each do |j| @@ -114,37 +110,50 @@ module OpenTox "substructure" => true, "smarts" => smarts.dup, "pValue" => p_value.to_f.abs.round(5), - "effect" => effect + "effect" => effect, + "parameters" => [ + { "title" => "dataset_id", "paramValue" => params[:dataset].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id } + ] }) features << feature - features.uniq! ftime += Time.now - ft id_arrs.each { |id_count_hash| id=id_count_hash.keys[0].to_i count=id_count_hash.values[0].to_i - compound_idx = params[:dataset].compounds.index @fminer.compounds[id] + fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {} + compound_idx = params[:dataset].compounds.index @fminer.compounds[id] feature_idx = features.index feature data_entries[compound_idx] ||= [] if params[:nr_hits] == "true" + fminer_results[@fminer.compounds[id]][feature] = count data_entries[compound_idx][feature_idx] = count else + fminer_results[@fminer.compounds[id]][feature] = 1 data_entries[compound_idx][feature_idx] = 1 end } end # end of end # feature parsing - #p features - p data_entries - #p params[:dataset].compounds - #p @fminer.compounds - puts "Fminer: #{Time.now-time} (find/create Features: #{ftime})" time = Time.now - #puts JSON.pretty_generate(fminer_results) + + # convert nil entries to 0 + data_entries.collect! do |r| + if r.empty? + Array.new(features.size,0) + else + r[features.size-1] = 0 if r.size < features.size # grow array to match feature size + r.collect!{|c| c.nil? ? 0 : c} # remove nils + end + end + =begin + # This part increases runtime by a factor of ~65 + # TODO: check if any information is lost due to simplification fminer_compounds = @fminer.training_dataset.compounds prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx| @@ -178,15 +187,10 @@ module OpenTox feature_dataset.save puts "Save: #{Time.now-time}" - p feature_dataset feature_dataset - - end - #end + #end + end end end end - - - diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 9a93b32..d862a41 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -56,25 +56,26 @@ module OpenTox end def self.smarts_match compounds, smarts, count=false + bad_request_error "Compounds for smarts_match are empty" unless compounds + bad_request_error "Smarts for smarts_match are empty" unless smarts compounds = parse compounds obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new obconversion.set_in_format('inchi') smarts_pattern = OpenBabel::OBSmartsPattern.new - fingerprint = {} - compounds = [compounds] unless compounds.is_a? Array + #fingerprint = {} smarts = [smarts] unless smarts.is_a? Array - compounds.each do |compound| + fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)} + compounds.each_with_index do |compound,c| obconversion.read_string(obmol,compound.inchi) - fingerprint[compound] = {} - smarts.each do |smart| + smarts.each_with_index do |smart,s| smarts_pattern.init(smart) if smarts_pattern.match(obmol) count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 else value = 0 end - fingerprint[compound][smart] = value + fingerprint[c][s] = value end end fingerprint diff --git a/lib/fminer.rb b/lib/fminer.rb index 3333517..59ee224 100644 --- a/lib/fminer.rb +++ b/lib/fminer.rb @@ -1,4 +1,5 @@ require_relative 'bbrc' +require_relative 'last' =begin * Name: fminer.rb * Description: Fminer library @@ -61,8 +62,7 @@ module OpenTox end if @minfreq.nil? @minfreq=min_frequency(@training_dataset,@prediction_feature,per_mil) - p "min_frequency #{@minfreq} (input was #{per_mil} per-mil)" - #$logger.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)" + $logger.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)" end end @@ -164,17 +164,18 @@ module OpenTox end metadata = { - RDF.type => [RDF::OT.Feature, RDF::OT.Substructure, RDF::OT.NumericFeature], - RDF::OT.smarts => smarts.dup, - RDF::OT.pValue => p_value.abs.round(5), - RDF::OT.effect => effect + "title" => smarts.dup, + "substructure" => true, + "numeric" => true, + "smarts" => smarts.dup, + "pValue" => p_value.abs.round(5), + "effect" => effect, + "parameters" => [ + { "title" => "dataset_id", "paramValue" => params[:dataset].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id } + ] } - parameters = [ - { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }, - { RDF::DC.title => "prediction_feature", RDF::OT.paramValue => params[:prediction_feature] } - ] - metadata[RDF::OT.hasSource]=feature_dataset_uri if feature_dataset_uri - [ metadata, parameters ] + metadata end # Minimum Frequency diff --git a/lib/last.rb b/lib/last.rb new file mode 100644 index 0000000..944d95e --- /dev/null +++ b/lib/last.rb @@ -0,0 +1,127 @@ +module OpenTox + module Algorithm + class Fminer + + # Run last algorithm on a dataset + # + # @param [String] dataset_uri URI of the training dataset + # @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable) + # @param [optional] parameters LAST parameters, accepted parameters are + # - min_frequency freq Minimum frequency (default 5) + # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") + # - nr_hits Set to "true" to get hit count instead of presence + # - get_target Set to "true" to obtain target variable as feature + # @return [text/uri-list] Task URI + def self.last params + + @fminer=OpenTox::Algorithm::Fminer.new + @fminer.check_params(params,80) + + # TODO introduce task again + #task = OpenTox::Task.run("Mining LAST features", uri('/fminer/last')) do |task| + + @last = Last::Last.new + @last.Reset + if @fminer.prediction_feature.feature_type == "regression" + @last.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! + else + bad_request_error "No accept values for "\ + "dataset '#{fminer.training_dataset.id}' and "\ + "feature '#{fminer.prediction_feature.id}'" unless + @fminer.prediction_feature.accept_values + value_map=@fminer.prediction_feature.value_map + end + @last.SetMinfreq(@fminer.minfreq) + @last.SetType(1) if params[:feature_type] == "paths" + @last.SetConsoleOut(false) + + + feature_dataset = OpenTox::Dataset.new + feature_dataset["title"] = "LAST representatives for #{@fminer.training_dataset.title}", + feature_dataset.creator = __FILE__ + feature_dataset.parameters = [ + { "title" => "dataset_id", "paramValue" => params[:dataset].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }, + { "title" => "min_frequency", "paramValue" => @fminer.minfreq }, + { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") } + ] + + @fminer.compounds = [] + @fminer.db_class_sizes = Array.new # AM: effect + @fminer.all_activities = Hash.new # DV: for effect calculation (class and regr) + @fminer.smi = [] # needed for matching the patterns back + + # Add data to fminer + @fminer.add_fminer_data(@last, value_map) + #task.progress 10 + #step_width = 80 / @bbrc.GetNoRootNodes().to_f + # run @last + xml = "" + (0 .. @last.GetNoRootNodes()-1).each do |j| + results = @last.MineRoot(j) + #task.progress 10+step_width*(j+1) + results.each do |result| + xml << result + end + end + + lu = LU.new # uses last-utils here + dom=lu.read(xml) # parse GraphML + smarts=lu.smarts_rb(dom,'nls') # converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de) + params[:nr_hits] == "true" ? hit_count=true : hit_count=false + matches, counts = lu.match_rb(@fminer.smi,smarts,hit_count,true) # creates instantiations + + features = [] + # create table with correct size + data_entries = Array.new(params[:dataset].compounds.size) {Array.new(matches.size,0)} + matches.each do |smarts, ids| + metadata = @fminer.calc_metadata(smarts, ids, counts[smarts], @last, nil, value_map, params) + feature = OpenTox::Feature.find_or_create_by(metadata) + features << feature + ids.each_with_index do |id,idx| + compound_idx = params[:dataset].compounds.index @fminer.compounds[id] + feature_idx = features.index feature + data_entries[compound_idx] ||= [] + data_entries[compound_idx][feature_idx] = counts[smarts][idx] + end + end + feature_dataset.compounds = @fminer.training_dataset.compounds + feature_dataset.features = features + feature_dataset.data_entries = data_entries + +=begin + # TODO check if this code is necessary, I dont understand what it does + fminer_compounds = @fminer.training_dataset.compounds + prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature + prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx| + @fminer.training_dataset.data_entries[idx][prediction_feature_idx] + } + fminer_noact_compounds = fminer_compounds - @fminer.compounds + + if (params[:get_target] == "true") + feature_dataset.features = [ @fminer.prediction_feature ] + feature_dataset.features + end + fminer_compounds.each_with_index { |c,idx| + # TODO: fix value insertion + row = [ c ] + if (params[:get_target] == "true") + row = row + [ prediction_feature_all_acts[idx] ] + end + features.each { |f| + row << (fminer_results[c] ? fminer_results[c][f] : nil) + } + row.collect! { |v| v ? v : 0 } unless fminer_noact_compounds.include? c + feature_dataset << row + } +=end + + feature_dataset.save + feature_dataset + + # end + end + + end + end +end + diff --git a/lib/lazar.rb b/lib/lazar.rb index 1957c24..d0d2b76 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -11,6 +11,30 @@ module OpenTox class Lazar include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "model" + + field :title, type: String + field :description, type: String + #field :parameters, type: Array, default: [] + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + field :feature_dataset_id, type: BSON::ObjectId + # algorithms + field :feature_generation, type: String + field :feature_calculation_algorithm, type: String + field :prediction_algorithm, type: Symbol + field :similarity_algorithm, type: Symbol + # prediction features + field :prediction_feature_id, type: BSON::ObjectId + field :predicted_value_id, type: BSON::ObjectId + field :predicted_variables, type: Array + # parameters + field :min_sim, type: Float + field :propositionalized, type:Boolean + field :min_train_performance, type: Float attr_accessor :prediction_dataset @@ -18,131 +42,127 @@ module OpenTox # Prepare lazar object (includes graph mining) # @param[Array] lazar parameters as strings # @param[Hash] REST parameters, as input by user - def self.create params + def self.create feature_dataset, prediction_feature=nil, params={} - lazar = OpenTox::Model::Lazar.new(File.join($model[:uri],SecureRandom.uuid)) + lazar = OpenTox::Model::Lazar.new - training_dataset = OpenTox::Dataset.new(params[:dataset_uri]) - lazar.parameters << {RDF::DC.title => "training_dataset_uri", RDF::OT.paramValue => training_dataset.uri} + bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty? + lazar.feature_dataset_id = feature_dataset.id + training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"]) + bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds + lazar.training_dataset_id = training_dataset.id - if params[:prediction_feature] - resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{params[:dataset_uri]}'" unless training_dataset.find_feature_uri( params[:prediction_feature] ) + if prediction_feature + resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{training_dataset.id}'" unless training_dataset.features.include?( params[:prediction_feature] ) else # try to read prediction_feature from dataset resource_not_found_error "Please provide a prediction_feature parameter" unless training_dataset.features.size == 1 - params[:prediction_feature] = training_dataset.features.first.uri + prediction_feature = training_dataset.features.first end - lazar[RDF::OT.trainingDataset] = training_dataset.uri - prediction_feature = OpenTox::Feature.new(params[:prediction_feature]) - predicted_variable = OpenTox::Feature.find_or_create({RDF::DC.title => "#{prediction_feature.title} prediction", RDF.type => [RDF::OT.Feature, prediction_feature[RDF.type]]}) - lazar[RDF::DC.title] = prediction_feature.title - lazar.parameters << {RDF::DC.title => "prediction_feature_uri", RDF::OT.paramValue => prediction_feature.uri} - lazar[RDF::OT.dependentVariables] = prediction_feature.uri - - bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" if params[:prediction_algorithm] and !OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm]) - lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => params[:prediction_algorithm]} if params[:prediction_algorithm] - - confidence_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "predicted_confidence", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]}) - lazar[RDF::OT.predictedVariables] = [ predicted_variable.uri, confidence_feature.uri ] - case prediction_feature.feature_type - when "classification" - lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "weighted_majority_vote"} unless lazar.parameter_value "prediction_algorithm" - lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.ClassificationLazySingleTarget] - when "regression" - lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "local_svm_regression"} unless lazar.parameter_value "prediction_algorithm" - lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.RegressionLazySingleTarget] + + lazar.prediction_feature_id = prediction_feature.id + lazar.title = prediction_feature.title + + if params and params[:prediction_algorithm] + bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" unless OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm]) + lazar.prediction_algorithm = params[:prediction_algorithm] end - lazar.parameter_value("prediction_algorithm") =~ /majority_vote/ ? lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => false} : lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => true} - lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => params[:min_sim].to_f} if params[:min_sim] and params[:min_sim].numeric? - lazar.parameters << {RDF::DC.title => "feature_generation_uri", RDF::OT.paramValue => params[:feature_generation_uri]} - #lazar.parameters["nr_hits"] = params[:nr_hits] + confidence_feature = OpenTox::Feature.find_or_create_by({ + "title" => "Prediction confidence", + "numeric" => true + }) - if params["feature_generation_uri"]=~/fminer/ - if (params[:nr_hits] == "true") - lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_count"} + unless lazar.prediction_algorithm + lazar.prediction_algorithm = :weighted_majority_vote if prediction_feature.nominal + lazar.prediction_algorithm = :local_svm_regression if prediction_feature.numeric + end + lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true + + lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric? + lazar.nr_hits = params[:nr_hits] if params[:nr_hits] + lazar.feature_generation = feature_dataset.creator + #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]} + # TODO insert algorithm into feature dataset + # TODO store algorithms in mongodb? + if lazar.feature_generation =~ /fminer|bbrc|last/ + if (lazar[:nr_hits] == "true") + lazar.feature_calculation_algorithm = "smarts_count" else - lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_match"} - end - lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "tanimoto"} - lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.3} unless lazar.parameter_value("min_sim") - elsif params["feature_generation_uri"]=~/descriptor/ or params["feature_generation_uri"]==nil - if params["feature_generation_uri"] - method = params["feature_generation_uri"].split(%r{/}).last.chomp - lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => method} + lazar.feature_calculation_algorithm = "smarts_match" end + lazar.similarity_algorithm = "tanimoto" + lazar.min_sim = 0.3 unless lazar.min_sim + elsif lazar.feature_generation =~/descriptor/ or lazar.feature_generation.nil? # cosine similartiy is default (e.g. used when no fetature_generation_uri is given and a feature_dataset_uri is provided instead) - lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "cosine"} - lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.7} unless lazar.parameter_value("min_sim") + lazar.similarity_algorithm = "cosine" + lazar.min_sim = 0.7 unless lazar.min_sim else - bad_request_error "unnkown feature generation method #{params["feature_generation_uri"]}" + bad_request_error "unkown feature generation method #{lazar.feature_generation}" end bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric? - lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => params[:min_train_performance].to_f} if params[:min_train_performance] and params[:min_train_performance].numeric? - lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => 0.1} unless lazar.parameter_value("min_train_performance") + lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric? + lazar.min_train_performance = 0.1 unless lazar.min_train_performance +=begin if params[:feature_dataset_uri] bad_request_error "Feature dataset #{params[:feature_dataset_uri]} does not exist." unless URI.accessible? params[:feature_dataset_uri] - lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => params[:feature_dataset_uri]} + lazar.parameters << {"title" => "feature_dataset_uri", "paramValue" => params[:feature_dataset_uri]} lazar[RDF::OT.featureDataset] = params["feature_dataset_uri"] else # run feature generation algorithm feature_dataset_uri = OpenTox::Algorithm::Generic.new(params[:feature_generation_uri]).run(params) - lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => feature_dataset_uri} + lazar.parameters << {"title" => "feature_dataset_uri", "paramValue" => feature_dataset_uri} lazar[RDF::OT.featureDataset] = feature_dataset_uri end - lazar.put - lazar.uri +=end + lazar.save + lazar end - def predict(params) - @prediction_dataset = OpenTox::Dataset.new - # set instance variables and prediction dataset parameters from parameters - params.each {|k,v| - self.class.class_eval { attr_accessor k.to_sym } - instance_variable_set "@#{k}", v - @prediction_dataset.parameters << {RDF::DC.title => k, RDF::OT.paramValue => v} - } - #["training_compounds", "fingerprints", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k| - ["training_compounds", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k| - self.class.class_eval { attr_accessor k.to_sym } - instance_variable_set("@#{k}", []) - } - - @prediction_feature = OpenTox::Feature.new @prediction_feature_uri - @predicted_variable = OpenTox::Feature.new @predicted_variable_uri - @predicted_confidence = OpenTox::Feature.new @predicted_confidence_uri - @prediction_dataset.metadata = { - RDF::DC.title => "Lazar prediction for #{@prediction_feature.title}", - RDF::DC.creator => @model_uri, - RDF::OT.hasSource => @model_uri, - RDF::OT.dependentVariables => @prediction_feature_uri, - RDF::OT.predictedVariables => [@predicted_variable_uri,@predicted_confidence_uri] - } - - @training_dataset = OpenTox::Dataset.new(@training_dataset_uri) - - @feature_dataset = OpenTox::Dataset.new(@feature_dataset_uri) - bad_request_error "No features found in feature dataset #{@feature_dataset.uri}." if @feature_dataset.features.empty? - - @similarity_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "#{@similarity_algorithm.capitalize} similarity", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]}) - - @prediction_dataset.features = [ @predicted_variable, @predicted_confidence, @prediction_feature, @similarity_feature ] + def predict params + + # tailored for performance + # all consistency checks should be done during model creation + + time = Time.now + + # prepare prediction dataset + prediction_dataset = OpenTox::Dataset.new + prediction_feature = OpenTox::Feature.find prediction_feature_id + prediction_feature = OpenTox::Feature.find prediction_feature_id + prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}", + prediction_dataset.creator = __FILE__, - prediction_feature_pos = @training_dataset.features.collect{|f| f.uri}.index @prediction_feature.uri + similarity_feature = OpenTox::Feature.find_or_create_by({ + "title" => "#{similarity_algorithm.capitalize} similarity", + "numeric" => true + }) + + #prediction_dataset.features = [ predicted_confidence, prediction_feature, similarity_feature ] - if @dataset_uri - compounds = OpenTox::Dataset.new(@dataset_uri).compounds + # TODO set instance variables and prediction dataset parameters from parameters (see development branch) + + + training_dataset = OpenTox::Dataset.find(training_dataset_id) + + feature_dataset = OpenTox::Dataset.find(feature_dataset_id) + + if params[:compound] + compounds = [ params[:compound]] else - compounds = [ OpenTox::Compound.new(@compound_uri) ] + compounds = params[:dataset].compounds end - # @training_fingerprints = @feature_dataset.data_entries + puts "Setup: #{Time.now-time}" + time = Time.now + + # TODO: this seems to be very time consuming + # uses > 11" on development machine # select training fingerprints from feature dataset (do NOT use entire feature dataset) - feature_compound_uris = @feature_dataset.compounds.collect{|c| c.uri} - @training_fingerprints = [] +=begin @training_dataset.compounds.each do |c| - idx = feature_compound_uris.index(c.uri) + idx = @feature_dataset.compounds.index(c) bad_request_error "training dataset compound not found in feature dataset" if idx==nil @training_fingerprints << @feature_dataset.data_entries[idx][0..-1] end @@ -151,61 +171,85 @@ module OpenTox values << nil while (values.size < @feature_dataset.features.size) values end - @training_compounds = @training_dataset.compounds - internal_server_error "sth went wrong #{@training_compounds.size} != #{@training_fingerprints.size}" if @training_compounds.size != @training_fingerprints.size - - feature_names = @feature_dataset.features.collect{ |f| f[RDF::DC.title] } - query_fingerprints = {} - # first lookup in feature dataset, than apply feature_generation_uri - compounds.each do |c| - idx = feature_compound_uris.index(c.uri) # just use first index, features should be equal for duplicates - if idx!=nil - fingerprint = {} - @feature_dataset.features.each do |f| - fingerprint[f[RDF::DC.title]] = @feature_dataset.data_entry_value(idx,f.uri) - end - query_fingerprints[c] = fingerprint - end - end - # if lookup failed, try computing! - if query_fingerprints.size!=compounds.size - bad_request_error "no feature_generation_uri provided in model AND cannot lookup all test compounds in existing feature dataset" unless @feature_calculation_algorithm - query_fingerprints = OpenTox::Algorithm::Descriptor.send( @feature_calculation_algorithm, compounds, feature_names )#.collect{|row| row.collect{|val| val ? val.to_f : 0.0 } } - end +=end + # replacement code (sequence has been preserved in bbrc and last + # uses ~0.025" on development machine + #@training_fingerprints = @feature_dataset.data_entries + #@training_compounds = @training_dataset.compounds + + #feature_names = @feature_dataset.features.collect{ |f| f[:title] } + + puts "Fingerprint: #{Time.now-time}" + time = Time.now + query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, feature_dataset.features.collect{|f| f["title"]} ) + + puts "Fingerprint calculation: #{Time.now-time}" + time = Time.now # AM: transform to cosine space - @min_sim = (@min_sim.to_f*2.0-1.0).to_s if @similarity_algorithm =~ /cosine/ + min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ - compounds.each_with_index do |compound,c_count| - $logger.debug "predict compound #{c_count+1}/#{compounds.size} #{compound.uri}" + neighbors = [] + compounds.each_with_index do |compound,c| + $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" - database_activities = @training_dataset.values(compound,@prediction_feature) + database_activities = training_dataset.values(compound,prediction_feature) if database_activities and !database_activities.empty? database_activities.each do |database_activity| $logger.debug "do not predict compound, it occurs in dataset with activity #{database_activity}" - @prediction_dataset << [compound, nil, nil, database_activity, nil] + prediction_dataset << [compound, nil, nil, database_activity, nil] end next - elsif @prediction_dataset.compound_indices(compound.uri) - $logger.debug "compound already predicted (copy old prediction)" - predicted_value = @prediction_dataset.data_entry_value(@prediction_dataset.compound_indices(compound.uri).first,@predicted_variable.uri) - confidence_value = @prediction_dataset.data_entry_value(@prediction_dataset.compound_indices(compound.uri).first,@predicted_confidence.uri) else +=begin @training_activities = @training_dataset.data_entries.collect{|entry| act = entry[prediction_feature_pos] if entry @prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act } +=end + + #@query_fingerprint = @feature_dataset.features.collect { |f| + #val = query_fingerprints[compound][f.title] + #bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric? + #val ? val.to_f : 0.0 + #} # query structure + + # TODO reintroduce for regression + #mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) + #mtf.transform + # - @query_fingerprint = @feature_dataset.features.collect { |f| - val = query_fingerprints[compound][f.title] - bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric? - val ? val.to_f : 0.0 - } # query structure + feature_dataset.data_entries.each_with_index do |fingerprint, i| + + sim = OpenTox::Algorithm::Similarity.send(similarity_algorithm,fingerprint, query_fingerprint[c]) + # TODO fix for multi feature datasets + neighbors << [feature_dataset.compounds[i],training_dataset.data_entries[i].first,sim] if sim > self.min_sim + end + similarity_sum = 0.0 + confidence_sum = 0.0 + prediction = nil + activities = training_dataset.data_entries.flatten.uniq.sort + neighbors.each do |n| + similarity_sum += n.last + if activities.index(n[1]) == 0 + confidence_sum += n.last + elsif activities.index(n[1]) == 1 + confidence_sum -= n.last + end + end + + if confidence_sum > 0.0 + prediction = activities[0] + else + prediction = activities[1] + end + + p prediction, confidence_sum/similarity_sum + - mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) - mtf.transform - prediction = OpenTox::Algorithm::Neighbors.send(@prediction_algorithm, +=begin + prediction = OpenTox::Algorithm::Neighbors.send(prediction_algorithm, { :props => mtf.props, :activities => mtf.activities, :sims => mtf.sims, @@ -220,8 +264,10 @@ module OpenTox confidence_value = ((confidence_value+1.0)/2.0).abs if @similarity_algorithm =~ /cosine/ predicted_value = @prediction_feature.value_map[prediction[:prediction].to_i] if @prediction_feature.feature_type == "classification" $logger.debug "predicted value: #{predicted_value}, confidence: #{confidence_value}" +=end end +=begin @prediction_dataset << [ compound, predicted_value, confidence_value, nil, nil ] if @compound_uri # add neighbors only for compound predictions @@ -231,9 +277,9 @@ module OpenTox @prediction_dataset << [ n, nil, nil, a, neighbor[:similarity] ] end end +=end end # iteration over compounds - @prediction_dataset.put @prediction_dataset end diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb index 4aaad9c..03236ee 100644 --- a/lib/opentox-algorithm.rb +++ b/lib/opentox-algorithm.rb @@ -1,5 +1,12 @@ require 'statsample' +ENV['FMINER_SMARTS'] = 'true' +ENV['FMINER_NO_AROMATIC'] = 'true' +ENV['FMINER_PVALUES'] = 'true' +ENV['FMINER_SILENT'] = 'true' +ENV['FMINER_NR_HITS'] = 'true' + + # Require sub-Repositories require_relative '../libfminer/libbbrc/bbrc' # include before openbabel require_relative '../libfminer/liblast/last' # @@ -8,3 +15,7 @@ require_relative '../last-utils/lu.rb' #Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f} require_relative "descriptor.rb" require_relative "fminer.rb" +require_relative "lazar.rb" +require_relative "transform.rb" +require_relative "similarity.rb" +require_relative "neighbors.rb" diff --git a/lib/transform.rb b/lib/transform.rb index 8b124f9..cbfa915 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -236,7 +236,6 @@ module OpenTox # @params[OpenTox::Model] model Model to transform def initialize model @model = model - @similarity_algorithm = @model.similarity_algorithm end # Transforms the model @@ -282,6 +281,7 @@ module OpenTox # neighbor calculation @ids = [] # surviving compounds become neighbors @sims = [] # calculated by neighbor routine + neighbors n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp @@ -294,7 +294,7 @@ module OpenTox gram_matrix[i] = [] unless gram_matrix[i] @n_prop.each_index do |j| if (j>i) - sim = eval("OpenTox::Algorithm::Similarity::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])") + sim = OpenTox::Algorithm::Similarity.send(@similarity_algorithm.to_sym, @n_prop[i], @n_prop[j]) gram_matrix[i][j] = sim gram_matrix[j] = [] unless gram_matrix[j] gram_matrix[j][i] = gram_matrix[i][j] @@ -393,7 +393,7 @@ module OpenTox # @param[Array] A propositionalized data entry # @return[Float] Similarity to query structure def similarity(training_props) - eval("OpenTox::Algorithm::Similarity").send(@model.similarity_algorithm,training_props, @q_prop) + OpenTox::Algorithm::Similarity.send(@model.similarity_algorithm,training_props, @q_prop) end -- cgit v1.2.3