From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 Nov 2015 17:50:17 +0100 Subject: neighbor search delegated to database backend --- lib/classification.rb | 6 ++---- lib/compound.rb | 52 ++++++++++++++++++++++++++++++-------------------- lib/crossvalidation.rb | 4 ++-- lib/dataset.rb | 21 ++++++++++++++++---- lib/descriptor.rb | 1 - lib/model.rb | 31 +++++++++++++++--------------- lib/regression.rb | 37 +++++++++++------------------------ 7 files changed, 79 insertions(+), 73 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index b4b2e59..7a225bb 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -5,14 +5,12 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] - return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? weighted_sum = {} sim_sum = 0.0 confidence = 0.0 neighbors.each do |row| - n,sim,acts = row - #confidence = sim if sim > confidence # distance to nearest neighbor - acts.each do |act| + sim = row["tanimoto"] + row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim end diff --git a/lib/compound.rb b/lib/compound.rb index a26528b..c5e7f02 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -23,13 +23,16 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :dataset_ids, type: Array, default: [] + field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) + #index({default_fingerprint: 1}, {unique: false}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save compound end @@ -41,7 +44,7 @@ module OpenTox if type == "MP2D" fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title - fingerprints[type] = fp + fingerprints[type] = fp.uniq # no fingerprint counts #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html elsif type== "MNA" level = 2 # TODO: level as parameter, evaluate level 1, see paper @@ -244,20 +247,23 @@ module OpenTox def fingerprint_neighbors params bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] neighbors = [] - #if params[:type] == DEFAULT_FINGERPRINT - #neighbors = db_neighbors params - #p neighbors - #else + if params[:type] == DEFAULT_FINGERPRINT + neighbors = db_neighbors params + else query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound + training_dataset = Dataset.find(params[:training_dataset_id]) + prediction_feature = training_dataset.features.first + training_dataset.compounds.each do |compound| + #unless self == compound candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << [compound.id, sim] if sim >= params[:min_sim] - end + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] + #end end - #end - neighbors.sort{|a,b| b.last <=> a.last} + neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} + end + neighbors end def fminer_neighbors params @@ -299,30 +305,34 @@ module OpenTox end def db_neighbors params - p "DB NEIGHBORS" - p params - # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - qn = fingerprint(params[:type]).size + + #qn = default_fingerprint_size #qmin = qn * threshold #qmax = qn / threshold #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] aggregate = [ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { 'tanimoto' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, + #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, - '_id' => 1 + '_id' => 1, + 'features' => 1, + 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + + + #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 2e6dabb..3127351 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -136,7 +136,7 @@ module OpenTox incorrect_predictions = 0 predictions.each do |p| if p[1] and p[2] - p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f confidences << p[3] @@ -243,7 +243,7 @@ module OpenTox :neighbors => neighbors } end - end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] + end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] end def confidence_plot diff --git a/lib/dataset.rb b/lib/dataset.rb index d989bdf..af116a9 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -105,10 +105,18 @@ module OpenTox test_cids = test_idxs.collect{|i| self.compound_ids[i]} test_data_entries = test_idxs.collect{|i| self.data_entries[i]} test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries) + test_dataset.compounds.each do |compound| + compound.dataset_ids << test_dataset.id + compound.save + end training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| self.compound_ids[i]} training_data_entries = training_idxs.collect{|i| self.data_entries[i]} training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries) + training_dataset.compounds.each do |compound| + compound.dataset_ids << training_dataset.id + compound.save + end test_dataset.save_all training_dataset.save_all chunks << [training_dataset,test_dataset] @@ -229,7 +237,7 @@ module OpenTox table.each_with_index do |vals,i| ct = Time.now - identifier = vals.shift + identifier = vals.shift.strip warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format @@ -246,7 +254,7 @@ module OpenTox warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - # TODO insert empty compounds to keep positions? + compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id compound_time += Time.now-ct r += 1 @@ -263,10 +271,15 @@ module OpenTox warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] - self.data_entries.last[j] = v.to_f + v = v.to_f else - self.data_entries.last[j] = v.strip + v = v.strip end + self.data_entries.last[j] = v + #i = compound.feature_ids.index feature_ids[j] + compound.features[feature_ids[j].to_s] ||= [] + compound.features[feature_ids[j].to_s] << v + compound.save end end compounds.duplicates.each do |compound| diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 9733bde..93ce591 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -107,7 +107,6 @@ module OpenTox des[lib] << descriptor end des.each do |lib,descriptors| - p lib, descriptors send(lib, descriptors) end serialize diff --git a/lib/model.rb b/lib/model.rb index 227d4d3..44b36e6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -76,22 +76,23 @@ module OpenTox t = Time.new neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # add activities - # TODO: improve efficiency, takes 3 times longer than previous version - neighbors.collect! do |n| - rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} - acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact - acts.empty? ? nil : n << acts - end - neighbors.compact! # remove neighbors without training activities + # remove neighbors without prediction_feature + # check for database activities (neighbors may include query compound) + database_activities = nil + if neighbors.collect{|n| n["_id"]}.include? compound.id - database_activities = training_dataset.values(compound,prediction_feature) - if use_database_values and database_activities and !database_activities.empty? - database_activities = database_activities.first if database_activities.size == 1 - predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} - next + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s] + neighbors.delete_if{|n| n["_id"] == compound.id} + end + neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + if neighbors.empty? + prediction = {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} + else + prediction = Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id}) end - predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) + prediction[:database_activities] = database_activities + predictions << prediction + =begin # TODO scaled dataset for physchem p neighbor_algorithm_parameters @@ -126,7 +127,7 @@ module OpenTox warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]} prediction_dataset.save_all return prediction_dataset end diff --git a/lib/regression.rb b/lib/regression.rb index 868c25f..575a1ef 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,39 +1,26 @@ -# TODO install R packages kernlab, caret, doMC, class, e1071 - - - # log transform activities (create new dataset) - # scale, normalize features, might not be necessary - # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is - # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression - # zero-order correlation and the semi-partial correlation - # seems to be necessary for svm - # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 - # http://stackoverflow.com/questions/15436367/svm-scaling-input-values - # use lasso or elastic net?? - # select relevant features - # remove features with a single value - # remove correlated features - # remove features not correlated with endpoint module OpenTox module Algorithm class Regression def self.weighted_average compound, params + #p params.keys weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] activities = [] neighbors.each do |row| - n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - acts.each do |act| - weighted_sum += sim*Math.log10(act) - activities << act - sim_sum += sim - end + #if row["dataset_ids"].include? params[:training_dataset_id] + sim = row["tanimoto"] + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + activities << act + sim_sum += sim + end + #end end #R.assign "activities", activities #R.eval "cv = cv(activities)" @@ -47,10 +34,8 @@ module OpenTox end def self.local_linear_regression compound, neighbors - p neighbors.size return nil unless neighbors.size > 0 features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - p features training_data = Array.new(neighbors.size){Array.new(features.size,0)} neighbors.each_with_index do |n,i| #p n.first -- cgit v1.2.3