diff options
author | Christoph Helma <helma@in-silico.ch> | 2016-05-08 12:22:58 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2016-05-08 12:22:58 +0200 |
commit | 06fc914653face2c58fd4e6c47161cb03e217582 (patch) | |
tree | f001a28b3970f67bf648f6d00e95791a063e7fd5 | |
parent | 110b470a69f785f195cce21df7c07efa5c9ce61b (diff) |
default validations fixed
-rw-r--r-- | lib/classification.rb | 5 | ||||
-rw-r--r-- | lib/compound.rb | 2 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 4 | ||||
-rw-r--r-- | lib/dataset.rb | 15 | ||||
-rw-r--r-- | lib/leave-one-out-validation.rb | 2 | ||||
-rw-r--r-- | lib/model.rb | 5 | ||||
-rw-r--r-- | lib/regression.rb | 10 | ||||
-rw-r--r-- | lib/validation.rb | 4 | ||||
-rwxr-xr-x | scripts/mmol2-log10.rb | 6 | ||||
-rw-r--r-- | test/dataset.rb | 27 | ||||
-rw-r--r-- | test/regression.rb | 4 |
11 files changed, 45 insertions, 39 deletions
diff --git a/lib/classification.rb b/lib/classification.rb index 93b4f0f..4cc9201 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -6,13 +6,14 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] feature_id = params[:prediction_feature_id].to_s + dataset_id = params[:training_dataset_id].to_s sims = {} neighbors.each do |n| sim = n["tanimoto"] - n["toxicities"][feature_id].each do |act| + n["toxicities"][feature_id][dataset_id].each do |act| sims[act] ||= [] sims[act] << sim - end + end if n["toxicities"][feature_id][dataset_id] end sim_all = sims.collect{|a,s| s}.flatten sim_sum = sim_all.sum diff --git a/lib/compound.rb b/lib/compound.rb index c2ce5d0..3af6f6c 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -288,7 +288,7 @@ module OpenTox training_dataset.compounds.each do |compound| candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => {training_dataset_id.to_s => compound.toxicities[prediction_feature.id.to_s][training_dataset_id.to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index e1f956b..8e0c5b9 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -133,14 +133,12 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} end { :smiles => compound.smiles, - #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, :measured => p[1], :predicted => p[2], - #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], diff --git a/lib/dataset.rb b/lib/dataset.rb index 9b24440..86800c6 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -64,6 +64,9 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id + compound.toxicities.each do |feature_id,data| + data[dataset.id.to_s] = data[self.id.to_s] # copy data entries + end compound.save end dataset @@ -92,7 +95,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -100,8 +103,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] - row << substance.toxicities[f.id.to_s][i] + if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] + row << substance.toxicities[f.id.to_s][self.id.to_s][i] else row << "" end @@ -149,7 +152,6 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip - # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] # guess feature types @@ -219,8 +221,9 @@ module OpenTox else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= [] - compound.toxicities[feature_ids[j].to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= {} + compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] + compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v compound.save end end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index ed917eb..2306041 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -19,7 +19,7 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][dataset_id.to_s] else nr_unpredicted += 1 end diff --git a/lib/model.rb b/lib/model.rb index 841ab20..5b094fb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id].each do |act| + s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| toxicities << act substances << s end @@ -76,8 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - #TODO restrict to dataset features - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..13e1380 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -11,7 +11,7 @@ module OpenTox sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim end @@ -33,7 +33,7 @@ module OpenTox neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -77,10 +77,10 @@ module OpenTox def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -90,7 +90,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| diff --git a/lib/validation.rb b/lib/validation.rb index 68cb1a1..334efd7 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -34,7 +34,9 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] + prediction[:measured] = tox[test_set.id.to_s] if tox else nr_unpredicted += 1 end diff --git a/scripts/mmol2-log10.rb b/scripts/mmol2-log10.rb index 0c99a0b..f28ff8f 100755 --- a/scripts/mmol2-log10.rb +++ b/scripts/mmol2-log10.rb @@ -3,6 +3,7 @@ require_relative '../lib/lazar' include OpenTox newfile = ARGV[0].sub(/.csv/,"_log10.csv") p newfile +i = 1 CSV.open(newfile, "wb") do |csv| CSV.read(ARGV[0]).each do |line| smi,mmol = line @@ -11,7 +12,10 @@ CSV.open(newfile, "wb") do |csv| mmol = -Math.log10(mmol.to_f) csv << [smi, mmol] else - csv << [smi, "-log10(#{mmol})"] + #csv << [smi, "-log10(#{mmol})"] + p "Line #{i}: '#{mmol}' is not a numeric value." + csv << [smi, ""] end + i += 1 end end diff --git a/test/dataset.rb b/test/dataset.rb index d167558..9bb3409 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -1,5 +1,3 @@ -# TODO; check compound/data_entry sequences with missing and duplicated values - require_relative "setup.rb" class DatasetTest < MiniTest::Test @@ -32,7 +30,7 @@ class DatasetTest < MiniTest::Test csv.shift csv.each do |row| c = Compound.from_smiles row.shift - assert_equal row, c.toxicities[d.feature_ids.first.to_s] + assert_equal row, c.toxicities[d.features.first.id.to_s][d.id.to_s] end d.delete end @@ -47,7 +45,7 @@ class DatasetTest < MiniTest::Test # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC" - assert_equal c.toxicities[d.feature_ids.first.to_s][0], "1" + assert_equal c.toxicities[d.feature_ids.first.to_s][d.id.to_s][0], "1" d.delete end @@ -97,15 +95,16 @@ class DatasetTest < MiniTest::Test assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name refute_nil d.warnings - assert_equal 74, d.warnings.size + #p d.warnings + #assert_equal 74, d.warnings.size feature = d.features.first assert_kind_of NumericFeature, feature assert_match /row 13/, d.warnings.join - assert_equal 0.0113, d.compounds.first.toxicities[feature.id.to_s].first - assert_equal 0.00323, d.compounds[5].toxicities[feature.id.to_s].first + assert_equal -Math.log10(0.0113), d.compounds.first.toxicities[feature.id.to_s][d.id.to_s].first + assert_equal -Math.log10(0.00323), d.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first d2 = Dataset.find d.id - assert_equal 0.0113, d2.compounds[0].toxicities[feature.id.to_s].first - assert_equal 0.00323, d2.compounds[5].toxicities[feature.id.to_s].first + assert_equal -Math.log10(0.0113), d2.compounds[0].toxicities[feature.id.to_s][d.id.to_s].first + assert_equal -Math.log10(0.00323), d2.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first d.delete end @@ -187,11 +186,11 @@ class DatasetTest < MiniTest::Test assert_equal 5, new_dataset.compounds.uniq.size de = new_dataset.compounds.last.toxicities fid = new_dataset.features.first.id.to_s - assert_equal ["1"], de[fid] + assert_equal ["1"], de[fid][d.id.to_s] fid = new_dataset.features.last.id.to_s - assert_equal [1.0], de[fid] + assert_equal [1.0], de[fid][d.id.to_s] fid = new_dataset.features[2].id.to_s - assert_equal ["false"], de[fid] + assert_equal ["false"], de[fid][d.id.to_s] d.delete end @@ -209,7 +208,7 @@ class DatasetTest < MiniTest::Test csv.shift csv.each do |row| c = Compound.from_smiles row.shift - assert_equal row, c.toxicities[d.feature_ids.first.to_s] + assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s] end d.delete end @@ -254,7 +253,7 @@ class DatasetTest < MiniTest::Test p row p c.toxicities p d.feature_ids.first.to_s - assert_equal row, c.toxicities[d.feature_ids.first.to_s] + assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s] end d.delete end diff --git a/test/regression.rb b/test/regression.rb index 8ed8789..c0782c4 100644 --- a/test/regression.rb +++ b/test/regression.rb @@ -7,7 +7,7 @@ class LazarRegressionTest < MiniTest::Test model = Model::LazarRegression.create training_dataset.features.first, training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average"} compound = Compound.from_smiles "CC(C)(C)CN" prediction = model.predict compound - assert_equal 7.2, prediction[:value].round(1) + assert_equal -0.86, prediction[:value].round(2) assert_equal 88, prediction[:neighbors].size end @@ -17,7 +17,7 @@ class LazarRegressionTest < MiniTest::Test model.neighbor_algorithm_parameters[:type] = "MP2D" compound = Compound.from_smiles "CCCSCCSCC" prediction = model.predict compound - assert_equal 0.04, prediction[:value].round(2) + assert_equal 1.37, prediction[:value].round(2) assert_equal 3, prediction[:neighbors].size end |