From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- test/dataset.rb | 364 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 242 insertions(+), 122 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index a7b8769..f028dbe 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -4,6 +4,15 @@ require_relative "setup.rb" class DatasetTest < MiniTest::Test + # basics + + def test_create_empty + d = Dataset.new + assert_equal Dataset, d.class + refute_nil d.id + assert_kind_of BSON::ObjectId, d.id + end + def test_all d1 = Dataset.new d1.save @@ -12,70 +21,160 @@ class DatasetTest < MiniTest::Test d1.delete end + # real datasets + + def test_upload_hamster + d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + assert_equal Dataset, d.class + assert_equal 1, d.features.size + assert_equal 85, d.compounds.size + csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") + csv.shift + csv.each do |row| + c = Compound.from_smiles row.shift + assert_equal c.toxicities[d.feature_ids.first.to_s], row + end + d.delete + end + + def test_upload_kazius + f = File.join DATA_DIR, "kazius.csv" + d = OpenTox::Dataset.from_csv_file f + csv = CSV.read f + assert_equal csv.size-1, d.compounds.size + assert_equal csv.first.size-1, d.features.size + assert_empty d.warnings + # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 + c = d.compounds[491] + assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC" + assert_equal c.toxicities[d.feature_ids.first.to_s][0], "1" + d.delete + end + + def test_upload_multicell + duplicates = [ + "InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H", + "InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2", + "InChI=1S/C2HCl3/c3-1-2(4)5/h1H", + "InChI=1S/C4H5Cl/c1-3-4(2)5/h3H,1-2H2", + "InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3", + "InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3", + "InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3", + ].collect{|inchi| Compound.from_inchi(inchi).smiles} + errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ] + f = File.join DATA_DIR, "multi_cell_call.csv" + d = OpenTox::Dataset.from_csv_file f + csv = CSV.read f + assert_equal true, d.features.first.nominal + assert_equal csv.size-1-errors.size, d.compounds.size + assert_equal csv.first.size-1, d.features.size + puts d.warnings.to_yaml + errors.each do |smi| + refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}} + end + duplicates.each do |smi| + refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}} + end + d.delete + end + + def test_upload_isscan + f = File.join DATA_DIR, "ISSCAN-multi.csv" + d = OpenTox::Dataset.from_csv_file f + csv = CSV.read f + assert_equal csv.size-1, d.compounds.size + assert_equal csv.first.size-1, d.features.size + d.delete + end + + def test_upload_epafhm + f = File.join DATA_DIR, "EPAFHM.csv" + d = OpenTox::Dataset.from_csv_file f + assert_equal Dataset, d.class + csv = CSV.read f + assert_equal csv.size-1, d.compounds.size + assert_equal csv.first.size-1, d.features.size + assert_match "EPAFHM.csv", d.source + assert_equal "EPAFHM", d.name + refute_nil d.warnings + assert_equal 74, d.warnings.size + feature = d.features.first + assert_kind_of NumericFeature, feature + assert_match /row 13/, d.warnings.join + assert_equal 0.0113, d.compounds.first.toxicities[feature.id.to_s].first + assert_equal 0.00323, d.compounds[5].toxicities[feature.id.to_s].first + d2 = Dataset.find d.id + assert_equal 0.0113, d2.compounds[0].toxicities[feature.id.to_s].first + assert_equal 0.00323, d2.compounds[5].toxicities[feature.id.to_s].first + d.delete + end + + # batch predictions + def test_create_without_features_smiles_and_inchi ["smiles", "inchi"].each do |type| d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv") assert_equal Dataset, d.class refute_nil d.id dataset = Dataset.find d.id - #p dataset.compounds assert_equal 3, d.compounds.size.to_i d.delete end end - def test_create_empty - d = Dataset.new - assert_equal Dataset, d.class - refute_nil d.id - assert_kind_of BSON::ObjectId, d.id + # dataset operations + + def test_folds + dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") + dataset.folds(10).each do |fold| + fold.each do |d| + assert_operator d.compounds.size, :>=, d.compounds.uniq.size + end + assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size + assert_equal dataset.substance_ids.size, fold.first.substance_ids.size + fold.last.substance_ids.size + assert_empty (fold.first.substance_ids & fold.last.substance_ids) + end end - def test_client_create - d = Dataset.new - assert_equal Dataset, d.class - d.name = "Create dataset test" + # serialisation - # add data entries - features = ["test1", "test2"].collect do |title| - f = Feature.new - f.name = title - f.numeric = true - f.save - f + def test_to_csv + d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" + refute_nil d.warnings + assert d.warnings.grep(/Duplicate compound/) + assert d.warnings.grep(/3, 5/) + assert_equal 6, d.features.size + assert_equal 5, d.compounds.uniq.size + assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size + csv = CSV.parse(d.to_csv) + original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv") + csv.shift + original_csv.shift + original = {} + original_csv.each do |row| + c = Compound.from_smiles row.shift.strip + original[c.inchi] = row.collect{|v| v.strip} end - - # manual low-level insertions without consistency checks for runtime efficiency - compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi| - Compound.from_smiles smi + serialized = {} + csv.each do |row| + c = Compound.from_smiles row.shift + serialized[c.inchi] = row end - data_entries = [] - data_entries << [1,2] - data_entries << [4,5] - data_entries << [6,7] - compounds.each_with_index do |c,i| - features.each_with_index do |f,j| - d.data_entries[c.id.to_s] ||= {} - d.data_entries[c.id.to_s][f.id.to_s] ||= [] - d.data_entries[c.id.to_s][f.id.to_s] << data_entries[i][j] + original.each do |inchi,row| + row.each_with_index do |v,i| + if v.numeric? + assert_equal v.to_f, serialized[inchi][i].to_f + else + assert_equal v, serialized[inchi][i] + end end - end - assert_equal 3, d.compounds.size - assert_equal 2, d.features.size - p d.data_entries - assert_equal [[1,2],[4,5],[6,7]], d.data_entries - d.save - # check if dataset has been saved correctly - new_dataset = Dataset.find d.id - assert_equal 3, new_dataset.compounds.size - assert_equal 2, new_dataset.features.size - assert_equal [[1,2],[4,5],[6,7]], new_dataset.data_entries - d.delete - assert_nil Dataset.find d.id - assert_nil Dataset.find new_dataset.id + end + d.delete end + # special cases/details + def test_dataset_accessors d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" # create empty dataset @@ -85,8 +184,8 @@ class DatasetTest < MiniTest::Test assert_equal "multicolumn", new_dataset.name # get features assert_equal 6, new_dataset.features.size - assert_equal 5, new_dataset.compounds.size - de = new_dataset.data_entries[new_dataset.compounds.last.id.to_s] + assert_equal 5, new_dataset.compounds.uniq.size + de = new_dataset.compounds.last.toxicities fid = new_dataset.features.first.id.to_s assert_equal ["1"], de[fid] fid = new_dataset.features.last.id.to_s @@ -96,16 +195,6 @@ class DatasetTest < MiniTest::Test d.delete end - def test_create_from_file - d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv") - assert_equal Dataset, d.class - refute_nil d.warnings - assert_match "EPAFHM.mini.csv", d.source - assert_equal "EPAFHM.mini.csv", d.name - d.delete - #assert_equal false, URI.accessible?(d.uri) - end - def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") refute_nil d.warnings @@ -113,56 +202,14 @@ class DatasetTest < MiniTest::Test d.delete end - def test_multicolumn_csv - d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" - refute_nil d.warnings - assert d.warnings.grep(/Duplicate compound/) - assert d.warnings.grep(/3, 5/) - assert_equal 6, d.features.size - assert_equal 5, d.compounds.size - assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size - assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries - assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7] - csv = CSV.parse(d.to_csv) - original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv") - csv.shift - original_csv.shift - csv.each_with_index do |row,i| - compound = Compound.from_smiles row.shift - original_compound = Compound.from_smiles original_csv[i].shift.strip - assert_equal original_compound.inchi, compound.inchi - row.each_with_index do |v,j| - if v.numeric? - assert_equal original_csv[i][j].strip.to_f, row[j].to_f - else - assert_equal original_csv[i][j].strip, row[j].to_s - end - end - end - d.delete - end - - def test_from_csv - d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - assert_equal Dataset, d.class - assert_equal 1, d.features.size - assert_equal 85, d.compounds.size - assert_equal 85, d.data_entries.size - csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") - csv.shift - assert_equal csv.collect{|r| r[1]}, d.data_entries.flatten - d.delete - #assert_equal false, URI.accessible?(d.uri) - end - def test_from_csv_classification ["int", "float", "string"].each do |mode| d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv" csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv") csv.shift - entries = d.data_entries.flatten - csv.each_with_index do |r, i| - assert_equal r[1].to_s, entries[i] + csv.each do |row| + c = Compound.from_smiles row.shift + assert_equal c.toxicities[d.feature_ids.first.to_s], row end d.delete end @@ -189,32 +236,105 @@ class DatasetTest < MiniTest::Test datasets.each{|d| d.delete} end - def test_create_from_file - d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv") - assert_equal Dataset, d.class - refute_nil d.warnings - assert_match /row 13/, d.warnings.join - assert_match "EPAFHM.mini.csv", d.source - assert_equal 1, d.features.size - feature = d.features.first - assert_kind_of NumericFeature, feature - assert_equal 0.0113, d.data_entries[0][0] - assert_equal 0.00323, d.data_entries[5][0] + # skips, may be removed in the future + + def test_simultanous_upload + skip + threads = [] + 3.times do |t| + threads << Thread.new(t) do |up| + d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + assert_equal OpenTox::Dataset, d.class + assert_equal 1, d.features.size + assert_equal 85, d.compounds.size + csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") + csv.shift + csv.each do |row| + c = Compound.from_smiles(row.shift) + p row + p c.toxicities + p d.feature_ids.first.to_s + assert_equal row, c.toxicities[d.feature_ids.first.to_s] + end + d.delete + end + end + threads.each {|aThread| aThread.join} + end + + def test_upload_feature_dataset + skip + t = Time.now + f = File.join DATA_DIR, "rat_feature_dataset.csv" + d = Dataset.from_csv_file f + assert_equal 458, d.features.size + d.save + #p "Upload: #{Time.now-t}" d2 = Dataset.find d.id - assert_equal 0.0113, d2.data_entries[0][0] - assert_equal 0.00323, d2.data_entries[5][0] + t = Time.now + assert_equal d.features.size, d2.features.size + csv = CSV.read f + csv.shift # remove header + assert_empty d2.warnings + assert_equal csv.size, d2.compounds.size + assert_equal csv.first.size-1, d2.features.size + d2.compounds.each_with_index do |compound,i| + row = csv[i] + row.shift # remove compound + assert_equal row, d2.data_entries[i] + end + #p "Dowload: #{Time.now-t}" + d2.delete + assert_nil Dataset.find d.id end - def test_folds - dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") - dataset.folds(10).each do |fold| - fold.each do |d| - assert_equal d.data_entries.size, d.compounds.size - assert_equal d.compounds.size, :>=, d.compounds.uniq.size + def test_client_create + skip + d = Dataset.new + assert_equal Dataset, d.class + d.name = "Create dataset test" + + # add data entries + features = ["test1", "test2"].collect do |title| + f = Feature.new + f.name = title + f.numeric = true + f.save + f + end + + # manual low-level insertions without consistency checks for runtime efficiency + compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi| + Compound.from_smiles smi + end + data_entries = [] + data_entries << [1,2] + data_entries << [4,5] + data_entries << [6,7] + compounds.each_with_index do |c,i| + features.each_with_index do |f,j| + d.substance_ids << c.id + d.feature_ids << f.id + c.toxicities[f.id.to_s] = data_entries[i][j] end - assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size end - #puts dataset.folds 10 + + assert_equal 3, d.compounds.size + assert_equal 2, d.features.size + #assert_equal [[1,2],[4,5],[6,7]], d.data_entries + d.save + # check if dataset has been saved correctly + new_dataset = Dataset.find d.id + assert_equal 3, new_dataset.compounds.size + assert_equal 2, new_dataset.features.size + new_dataset.compounds.each_with_index do |c,i| + new_dataset.features.each_with_index do |f,j| + assert_equal data_entries[i][j], c.toxicities[f.id.to_s].first + end + end + d.delete + assert_nil Dataset.find d.id + assert_nil Dataset.find new_dataset.id end end -- cgit v1.2.3