From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/classification.rb | 5 +++-- lib/compound.rb | 2 +- lib/crossvalidation.rb | 4 +--- lib/dataset.rb | 15 +++++++++------ lib/leave-one-out-validation.rb | 2 +- lib/model.rb | 5 ++--- lib/regression.rb | 10 +++++----- lib/validation.rb | 4 +++- 8 files changed, 25 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 93b4f0f..4cc9201 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -6,13 +6,14 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] feature_id = params[:prediction_feature_id].to_s + dataset_id = params[:training_dataset_id].to_s sims = {} neighbors.each do |n| sim = n["tanimoto"] - n["toxicities"][feature_id].each do |act| + n["toxicities"][feature_id][dataset_id].each do |act| sims[act] ||= [] sims[act] << sim - end + end if n["toxicities"][feature_id][dataset_id] end sim_all = sims.collect{|a,s| s}.flatten sim_sum = sim_all.sum diff --git a/lib/compound.rb b/lib/compound.rb index c2ce5d0..3af6f6c 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -288,7 +288,7 @@ module OpenTox training_dataset.compounds.each do |compound| candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => {training_dataset_id.to_s => compound.toxicities[prediction_feature.id.to_s][training_dataset_id.to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index e1f956b..8e0c5b9 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -133,14 +133,12 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} end { :smiles => compound.smiles, - #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, :measured => p[1], :predicted => p[2], - #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], diff --git a/lib/dataset.rb b/lib/dataset.rb index 9b24440..86800c6 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -64,6 +64,9 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id + compound.toxicities.each do |feature_id,data| + data[dataset.id.to_s] = data[self.id.to_s] # copy data entries + end compound.save end dataset @@ -92,7 +95,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -100,8 +103,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] - row << substance.toxicities[f.id.to_s][i] + if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] + row << substance.toxicities[f.id.to_s][self.id.to_s][i] else row << "" end @@ -149,7 +152,6 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip - # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] # guess feature types @@ -219,8 +221,9 @@ module OpenTox else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= [] - compound.toxicities[feature_ids[j].to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= {} + compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] + compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v compound.save end end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index ed917eb..2306041 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -19,7 +19,7 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][dataset_id.to_s] else nr_unpredicted += 1 end diff --git a/lib/model.rb b/lib/model.rb index 841ab20..5b094fb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id].each do |act| + s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| toxicities << act substances << s end @@ -76,8 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - #TODO restrict to dataset features - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..13e1380 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -11,7 +11,7 @@ module OpenTox sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim end @@ -33,7 +33,7 @@ module OpenTox neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -77,10 +77,10 @@ module OpenTox def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -90,7 +90,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| diff --git a/lib/validation.rb b/lib/validation.rb index 68cb1a1..334efd7 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -34,7 +34,9 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] + prediction[:measured] = tox[test_set.id.to_s] if tox else nr_unpredicted += 1 end -- cgit v1.2.3