From e718cf76f32fb29d6c7c3732ec82f35b0da49122 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 17:06:46 +0200 Subject: sdf import, csv files with id column --- test/dataset.rb | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 055a029..11a4697 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -1,6 +1,21 @@ +# batch class + require_relative "setup.rb" class DatasetTest < MiniTest::Test + + # TODO + def test_from_pubchem + d = Dataset.from_pubchem 1190 + end + + def test_merge + skip "TODO" + end + + def test_to_sdf + skip "TODO" + end # basics @@ -21,6 +36,34 @@ class DatasetTest < MiniTest::Test # real datasets + def test_upload_csv_with_id + d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" + assert_equal 53, d.compounds.size + assert_equal 1, d.features.size + f = d.features[0] + assert_equal "original_id", f.name + assert_equal ["123-30-8"], d.values(d.compounds.first,f) + end + + def test_upload_tsv_with_id + d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" + assert_equal 53, d.compounds.size + assert_equal 1, d.features.size + assert_equal 1, d.features.size + f = d.features[0] + assert_equal "original_id", f.name + assert_equal ["123-30-8"], d.values(d.compounds.first,f) + end + + def test_upload_sdf + #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" + assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles + f = Feature.find_by(:name => "original_id") + assert_equal 35, d.features.size + assert_equal ["9415"], d.values(d.compounds.first,f) + end + def test_upload_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class @@ -103,6 +146,15 @@ class DatasetTest < MiniTest::Test d.delete end + def test_multiple_uploads + datasets = [] + 2.times do + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + datasets << d + end + assert_equal datasets[0],datasets[1] + end + # batch predictions def test_create_without_features_smiles_and_inchi -- cgit v1.2.3 From 47a49508a736549006418ac9a9607ec0f5083a55 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 19:31:48 +0200 Subject: partial pubchem classification import --- test/dataset.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 11a4697..5157803 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -4,9 +4,13 @@ require_relative "setup.rb" class DatasetTest < MiniTest::Test - # TODO def test_from_pubchem - d = Dataset.from_pubchem 1190 + d = Dataset.from_pubchem 1191 + assert_equal 87, d.compounds.size + assert_equal 2, d.features.size + assert_equal "Active", d.values(d.compounds[10],d.features[1]) + # TODO endpoint name + # TODO regression import end def test_merge -- cgit v1.2.3 From 0a8da103e020b4a584a28a52b4ba12e1f3f90fd3 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Sun, 7 Oct 2018 18:12:39 +0200 Subject: dataset merge with feature/value maps --- test/dataset.rb | 74 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 28 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 5157803..11b8d49 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -3,23 +3,6 @@ require_relative 
"setup.rb" class DatasetTest < MiniTest::Test - - def test_from_pubchem - d = Dataset.from_pubchem 1191 - assert_equal 87, d.compounds.size - assert_equal 2, d.features.size - assert_equal "Active", d.values(d.compounds[10],d.features[1]) - # TODO endpoint name - # TODO regression import - end - - def test_merge - skip "TODO" - end - - def test_to_sdf - skip "TODO" - end # basics @@ -39,27 +22,37 @@ class DatasetTest < MiniTest::Test end # real datasets + + def test_import_pubchem + d = Dataset.from_pubchem 1191 + assert_equal 87, d.compounds.size + assert_equal 2, d.features.size + assert_equal "Active", d.values(d.compounds[10],d.features[1]) + # TODO endpoint name + # TODO regression import + end - def test_upload_csv_with_id + def test_import_csv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" assert_equal 53, d.compounds.size assert_equal 1, d.features.size f = d.features[0] - assert_equal "original_id", f.name + assert_equal "input_53.csv.ID", f.name + assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end - def test_upload_tsv_with_id + def test_import_tsv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" assert_equal 53, d.compounds.size assert_equal 1, d.features.size - assert_equal 1, d.features.size f = d.features[0] - assert_equal "original_id", f.name + assert_equal "input_53.tsv.ID", f.name + assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end - def test_upload_sdf + def test_import_sdf #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles @@ -68,7 +61,7 @@ class DatasetTest < MiniTest::Test assert_equal ["9415"], d.values(d.compounds.first,f) end - def test_upload_hamster + def test_import_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class assert_equal 1, d.features.size @@ -82,7 +75,7 @@ class DatasetTest < MiniTest::Test d.delete end - def test_upload_kazius + def test_import_kazius f = File.join DATA_DIR, "kazius.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f @@ -96,7 +89,7 @@ class DatasetTest < MiniTest::Test d.delete end - def test_upload_multicell + def test_import_multicell duplicates = [ "InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H", "InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2", @@ -122,7 +115,7 @@ class DatasetTest < MiniTest::Test d.delete end - def test_upload_isscan + def test_import_isscan f = File.join DATA_DIR, "ISSCAN-multi.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f @@ -131,7 +124,7 @@ class DatasetTest < MiniTest::Test d.delete end - def test_upload_epafhm + def test_import_epafhm f = File.join DATA_DIR, "EPAFHM_log10.csv" d = OpenTox::Dataset.from_csv_file f assert_equal Dataset, d.class @@ -174,6 +167,21 @@ class DatasetTest < MiniTest::Test # dataset operations + def test_merge + source_feature = Feature.where(:name => "Ames test categorisation").first + target_feature = Feature.where(:name => "Mutagenicity").first + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} + File.open("tmp.csv","w+"){|f| f.puts 
d.to_csv} + assert_equal 8281, d.compounds.size + assert_equal 4, d.features.size + c = Compound.from_smiles("C/C=C/C=O") + assert_equal ["mutagen"], d.values(c,target_feature) + assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + end + def test_folds dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") dataset.folds(10).each do |fold| @@ -224,6 +232,16 @@ class DatasetTest < MiniTest::Test d.delete end + def test_to_sdf + d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.csv" + File.open("#{DATA_DIR}/tmp.sdf","w+") do |f| + f.puts d.to_sdf + end + d2 = Dataset.from_sdf_file "#{DATA_DIR}/tmp.sdf" + assert_equal d.compounds.size, d2.compounds.size + `rm #{DATA_DIR}/tmp.sdf` + end + # special cases/details def test_dataset_accessors -- cgit v1.2.3 From bdc6b5b40437896384561d74a510560e9e592364 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 9 Oct 2018 18:20:27 +0200 Subject: tentative random forest classification: hangs unpredictably during caret model generation/optimization for some (inorganic?) compounds. --- test/dataset.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 11b8d49..4196fd8 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -66,6 +66,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class assert_equal 1, d.features.size assert_equal 85, d.compounds.size + assert_equal true, d.features.first.measured csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift csv.each do |row| @@ -174,7 +175,7 @@ class DatasetTest < MiniTest::Test hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} - File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} assert_equal 8281, d.compounds.size assert_equal 4, d.features.size c = Compound.from_smiles("C/C=C/C=O") -- cgit v1.2.3 From 8b31acab67e22f30a87c995a94f1ee1e2a3d510f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 10 Oct 2018 21:39:11 +0200 Subject: dataset tests fixed --- test/dataset.rb | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 4196fd8..2b439bb 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -24,10 +24,10 @@ class DatasetTest < MiniTest::Test # real datasets def test_import_pubchem - d = Dataset.from_pubchem 1191 + d = Dataset.from_pubchem_aid 1191 assert_equal 87, d.compounds.size assert_equal 2, d.features.size - assert_equal "Active", d.values(d.compounds[10],d.features[1]) + assert_equal ["Active"], d.values(d.compounds[10],d.features[1]) # TODO endpoint name # TODO regression import end @@ -37,7 +37,7 @@ class DatasetTest < MiniTest::Test assert_equal 53, d.compounds.size assert_equal 1, d.features.size f = d.features[0] - assert_equal "input_53.csv.ID", f.name + assert_equal "input_53.ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end @@ -47,18 +47,18 @@ class DatasetTest < MiniTest::Test assert_equal 53, d.compounds.size assert_equal 1, d.features.size f = d.features[0] - assert_equal "input_53.tsv.ID", f.name + assert_equal "input_53.ID", f.name assert_equal OriginalId, f.class 
assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf - #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles - f = Feature.find_by(:name => "original_id") assert_equal 35, d.features.size - assert_equal ["9415"], d.values(d.compounds.first,f) + assert_kind_of NumericSubstanceProperty, d.features[1] + assert_equal NominalSubstanceProperty, d.features.last.class + assert_equal 602, d.compounds.size + assert_match "PUBCHEM_XLOGP3_AA", d.warnings.last end def test_import_hamster @@ -66,7 +66,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class assert_equal 1, d.features.size assert_equal 85, d.compounds.size - assert_equal true, d.features.first.measured + assert_equal NominalBioActivity, d.features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift csv.each do |row| @@ -104,7 +104,7 @@ class DatasetTest < MiniTest::Test f = File.join DATA_DIR, "multi_cell_call.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f - assert_equal true, d.features.first.nominal? + assert_equal NominalBioActivity, d.features.first.class assert_equal 1056, d.compounds.size assert_equal csv.first.size-1, d.features.size errors.each do |smi| @@ -157,7 +157,7 @@ class DatasetTest < MiniTest::Test def test_create_without_features_smiles_and_inchi ["smiles", "inchi"].each do |type| - d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true + d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv") assert_equal Dataset, d.class refute_nil d.id dataset = Dataset.find d.id @@ -169,6 +169,7 @@ class DatasetTest < MiniTest::Test # dataset operations def test_merge + skip # TODO use new Features source_feature = Feature.where(:name => "Ames test categorisation").first target_feature = Feature.where(:name => "Mutagenicity").first kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" @@ -177,10 +178,11 @@ class DatasetTest < MiniTest::Test d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} assert_equal 8281, d.compounds.size - assert_equal 4, d.features.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagen"], d.values(c,target_feature) assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + p d.features + assert_equal 4, d.features.size end def test_folds @@ -219,7 +221,6 @@ class DatasetTest < MiniTest::Test c = Compound.from_smiles row.shift serialized[c.inchi] = row end - #puts serialized.to_yaml original.each do |inchi,row| row.each_with_index do |v,i| if v.numeric? 
-- cgit v1.2.3 From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- test/dataset.rb | 89 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 32 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 2b439bb..163f178 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -26,8 +26,8 @@ class DatasetTest < MiniTest::Test def test_import_pubchem d = Dataset.from_pubchem_aid 1191 assert_equal 87, d.compounds.size - assert_equal 2, d.features.size - assert_equal ["Active"], d.values(d.compounds[10],d.features[1]) + assert_equal 3, d.features.size + assert_equal ["Active"], d.values(d.compounds[10],d.features[2]) # TODO endpoint name # TODO regression import end @@ -35,9 +35,9 @@ class DatasetTest < MiniTest::Test def test_import_csv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end @@ -45,16 +45,16 @@ class DatasetTest < MiniTest::Test def test_import_tsv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal 35, d.features.size + assert_equal 37, d.features.size assert_kind_of NumericSubstanceProperty, d.features[1] assert_equal NominalSubstanceProperty, d.features.last.class assert_equal 602, d.compounds.size @@ -64,7 +64,7 @@ class DatasetTest < MiniTest::Test def test_import_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size assert_equal NominalBioActivity, d.features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") @@ -81,7 +81,7 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_empty d.warnings # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] @@ -121,8 +121,9 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size - d.delete + assert_equal csv.first.size+1, d.features.size + # TODO fix csv output (headers, column order) + puts d.to_csv end def test_import_epafhm @@ -131,7 +132,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name feature = d.features.first @@ -168,23 +169,6 @@ class DatasetTest < MiniTest::Test # dataset operations - def test_merge - skip # TODO use new Features 
- source_feature = Feature.where(:name => "Ames test categorisation").first - target_feature = Feature.where(:name => "Mutagenicity").first - kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} - #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} - assert_equal 8281, d.compounds.size - c = Compound.from_smiles("C/C=C/C=O") - assert_equal ["mutagen"], d.values(c,target_feature) - assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - p d.features - assert_equal 4, d.features.size - end - def test_folds dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") dataset.folds(10).each do |fold| @@ -197,10 +181,48 @@ class DatasetTest < MiniTest::Test end end + def test_copy + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + copy = d.copy + assert_equal d.data_entries, copy.data_entries + assert_equal d.name, copy.name + assert_equal d.id.to_s, copy.source + end + + def test_map + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + assert_equal 1, d.bioactivity_features.size + map = {"true" => "carcinogen", "false" => "non-carcinogen"} + mapped = d.map(d.bioactivity_features.first, map) + c = d.compounds.sample + assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) + assert_equal d.original_id(c), mapped.original_id(c) + assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name + assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values + end + + def test_merge + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + datasets = [kazius,hansen_mapped,efsa_mapped] + d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq + File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + assert_equal 8281, d.compounds.size + c = Compound.from_smiles("C/C=C/C=O") + assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) + assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + p d.features + assert_equal 4, d.features.size + end + # serialisation def test_to_csv d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" + # TODO warnings refute_nil d.warnings assert d.warnings.grep(/Duplicate compound/) assert d.warnings.grep(/3, 5/) @@ -268,6 +290,7 @@ class DatasetTest < MiniTest::Test def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") + p d.to_csv refute_nil d.warnings assert_match /2|3|4|5|6|7|8/, d.warnings.join d.delete @@ -289,6 +312,8 @@ class DatasetTest < MiniTest::Test def test_from_csv2 File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv" + p dataset + p 
dataset.to_csv assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join File.delete "#{DATA_DIR}/temp_test.csv" dataset.features.each{|f| feature = Feature.find f.id; feature.delete} @@ -313,7 +338,7 @@ class DatasetTest < MiniTest::Test threads << Thread.new(t) do |up| d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal OpenTox::Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift -- cgit v1.2.3 From 24e5f9cc16ba164f860620184dc39b024bc3d384 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 23:51:32 +0200 Subject: dataset tests fixed --- test/dataset.rb | 104 ++++++++++++++++++++++++-------------------------------- 1 file changed, 45 insertions(+), 59 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 163f178..5a620dd 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -47,18 +47,19 @@ class DatasetTest < MiniTest::Test assert_equal 53, d.compounds.size assert_equal 2, d.features.size f = d.features[1] - assert_equal "ID", f.name + assert_equal "Id", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal 37, d.features.size - assert_kind_of NumericSubstanceProperty, d.features[1] - assert_equal NominalSubstanceProperty, d.features.last.class + assert_equal 36, d.features.size + assert_kind_of NumericSubstanceProperty, d.substance_property_features[1] + assert_equal NominalSubstanceProperty, d.substance_property_features.last.class assert_equal 602, d.compounds.size - assert_match "PUBCHEM_XLOGP3_AA", d.warnings.last + #p d.warnings + assert_match "PUBCHEM_XLOGP3_AA", d.warnings.compact.last end def test_import_hamster @@ -66,12 +67,12 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class assert_equal 3, d.features.size assert_equal 85, d.compounds.size - assert_equal NominalBioActivity, d.features.first.class + assert_equal NominalBioActivity, d.bioactivity_features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift csv.each do |row| c = Compound.from_smiles row.shift - assert_equal row, d.values(c,d.features.first) + assert_equal row, d.values(c,d.bioactivity_features.first) end d.delete end @@ -86,7 +87,7 @@ class DatasetTest < MiniTest::Test # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC" - assert_equal ["1"], d.values(c,d.features.first) + assert_equal ["1"], d.values(c,d.bioactivity_features.first) d.delete end @@ -99,19 +100,19 @@ class DatasetTest < MiniTest::Test "InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3", "InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3", "InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3", - ].collect{|inchi| Compound.from_inchi(inchi).smiles} + ] errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ] f = File.join DATA_DIR, "multi_cell_call.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f - assert_equal NominalBioActivity, d.features.first.class + assert_equal NominalBioActivity, d.bioactivity_features.first.class assert_equal 1056, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size-1, 
d.bioactivity_features.size errors.each do |smi| - refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}} + assert_match smi, d.warnings.join end - duplicates.each do |smi| - refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}} + duplicates.each do |inchi| + refute_empty d.values(Compound.from_inchi(inchi),d.warnings_feature) end d.delete end @@ -123,7 +124,7 @@ class DatasetTest < MiniTest::Test assert_equal csv.size-1, d.compounds.size assert_equal csv.first.size+1, d.features.size # TODO fix csv output (headers, column order) - puts d.to_csv + #puts d.to_csv end def test_import_epafhm @@ -135,7 +136,7 @@ class DatasetTest < MiniTest::Test assert_equal csv.first.size+1, d.features.size assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name - feature = d.features.first + feature = d.bioactivity_features.first assert_kind_of NumericFeature, feature assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first assert_equal -Math.log10(0.00323), d.values(d.compounds[4],feature).first @@ -202,6 +203,7 @@ class DatasetTest < MiniTest::Test end def test_merge + skip kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" @@ -214,44 +216,27 @@ class DatasetTest < MiniTest::Test c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - p d.features assert_equal 4, d.features.size end # serialisation def test_to_csv + # TODO + skip d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" - # TODO warnings - refute_nil d.warnings - assert d.warnings.grep(/Duplicate compound/) - assert d.warnings.grep(/3, 5/) - assert_equal 6, d.features.size - assert_equal 5, d.compounds.uniq.size - assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size csv = CSV.parse(d.to_csv) original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv") - csv.shift - original_csv.shift - original = {} - original_csv.each do |row| - c = Compound.from_smiles row.shift.strip - original[c.inchi] = row.collect{|v| v.strip} - end - serialized = {} - csv.each do |row| - c = Compound.from_smiles row.shift - serialized[c.inchi] = row - end - original.each do |inchi,row| - row.each_with_index do |v,i| - if v.numeric? 
- assert_equal v.to_f, serialized[inchi][i].to_f - else - assert_equal v.to_s, serialized[inchi][i].to_s - end + header = csv.shift + original_header = original_csv.shift.collect{|h| h.strip} + #p header, original_header + original_header.each_with_index do |name,i| + name = "Original SMILES" if name == "SMILES" + j = header.index name + original_csv.each_with_index do |row,k| + row.collect!{|c| c.strip} + assert_equal csv[k][j], original_csv[k][i] end - end d.delete end @@ -270,30 +255,35 @@ class DatasetTest < MiniTest::Test def test_dataset_accessors d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" + refute_nil d.warnings + assert d.warnings.grep(/Duplicate compound/) + assert d.warnings.grep(/3, 5/) + assert_equal 9, d.features.size + assert_equal 5, d.compounds.uniq.size + assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size # create empty dataset new_dataset = Dataset.find d.id # get metadata assert_match "multicolumn.csv", new_dataset.source assert_equal "multicolumn", new_dataset.name # get features - assert_equal 6, new_dataset.features.size + assert_equal 9, new_dataset.features.size assert_equal 5, new_dataset.compounds.uniq.size c = new_dataset.compounds.last - f = new_dataset.features.first + f = new_dataset.substance_property_features.first assert_equal ["1"], new_dataset.values(c,f) - f = new_dataset.features.last.id.to_s + f = new_dataset.substance_property_features.last.id assert_equal [1.0], new_dataset.values(c,f) - f = new_dataset.features[2] + f = new_dataset.substance_property_features[2] assert_equal ["false"], new_dataset.values(c,f) d.delete end def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") - p d.to_csv + #p d.to_csv refute_nil d.warnings assert_match /2|3|4|5|6|7|8/, d.warnings.join - d.delete end def test_from_csv_classification @@ -303,21 +293,16 @@ class DatasetTest < MiniTest::Test csv.shift csv.each do |row| c = Compound.from_smiles row.shift - assert_equal row, d.values(c,d.features.first) + assert_equal row, d.values(c,d.bioactivity_features.first) end - d.delete end end def test_from_csv2 File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv" - p dataset - p dataset.to_csv - assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join + assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.last File.delete "#{DATA_DIR}/temp_test.csv" - dataset.features.each{|f| feature = Feature.find f.id; feature.delete} - dataset.delete end def test_same_feature @@ -333,10 +318,11 @@ class DatasetTest < MiniTest::Test end def test_simultanous_upload + skip threads = [] 3.times do |t| threads << Thread.new(t) do |up| - d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal OpenTox::Dataset, d.class assert_equal 3, d.features.size assert_equal 85, d.compounds.size @@ -344,7 +330,7 @@ class DatasetTest < MiniTest::Test csv.shift csv.each do |row| c = Compound.from_smiles(row.shift) - assert_equal row, d.values(c,d.features.first) + assert_equal row, d.values(c,d.bioactivity_features.first) end d.delete end -- cgit v1.2.3 From 15f4ad23eb918a91d52779887ccfb51bc6547f1b Mon Sep 17 
00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 25 Oct 2018 18:58:19 +0200 Subject: dataset merge --- test/dataset.rb | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 5a620dd..0beea2d 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -1,5 +1,3 @@ -# batch class - require_relative "setup.rb" class DatasetTest < MiniTest::Test @@ -123,8 +121,6 @@ class DatasetTest < MiniTest::Test csv = CSV.read f assert_equal csv.size-1, d.compounds.size assert_equal csv.first.size+1, d.features.size - # TODO fix csv output (headers, column order) - #puts d.to_csv end def test_import_epafhm @@ -197,48 +193,40 @@ class DatasetTest < MiniTest::Test mapped = d.map(d.bioactivity_features.first, map) c = d.compounds.sample assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) - assert_equal d.original_id(c), mapped.original_id(c) + assert_equal d.values(c,d.original_id_features.first), mapped.values(c,mapped.original_id_features.first) assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values end def test_merge - skip kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - datasets = [kazius,hansen_mapped,efsa_mapped] - d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq - File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + #p "mapping hansen" + #hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + #p "mapping efsa" + #efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + #datasets = [kazius,hansen_mapped,efsa_mapped] + datasets = [kazius,hansen,efsa] + d = Dataset.merge datasets#, datasets.collect{|d| d.bioactivity_features}.flatten.uniq assert_equal 8281, d.compounds.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) - assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - assert_equal 4, d.features.size + assert_equal datasets.collect{|d| d.id.to_s}.join(", "), d.source + assert_equal 8, d.features.size + p "serializing" + File.open("tmp.csv","w+"){|f| f.puts d.to_csv} end # serialisation def test_to_csv - # TODO - skip d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" csv = CSV.parse(d.to_csv) - original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv") - header = csv.shift - original_header = original_csv.shift.collect{|h| h.strip} - #p header, original_header - original_header.each_with_index do |name,i| - name = "Original SMILES" if name == "SMILES" - j = header.index name - original_csv.each_with_index do |row,k| - row.collect!{|c| c.strip} - assert_equal csv[k][j], original_csv[k][i] - end - end - d.delete + assert_equal "3 5", csv[3][0] + assert_match "3, 5", csv[3][9] + assert_match "Duplicate", csv[3][9] + assert_equal '7,c1nccc1,[N]1C=CC=C1,1,,false,,,1.0,', 
csv[5].join(",") end def test_to_sdf -- cgit v1.2.3 From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 17:58:09 +0100 Subject: dataset folds fixed --- test/dataset.rb | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 0beea2d..c197648 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -172,6 +172,10 @@ class DatasetTest < MiniTest::Test fold.each do |d| assert_operator d.compounds.size, :>=, d.compounds.uniq.size end + refute_empty fold[0].compounds + refute_empty fold[1].compounds + refute_empty fold[0].data_entries + refute_empty fold[1].data_entries assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size assert_empty (fold.first.substances & fold.last.substances) -- cgit v1.2.3 From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 20:34:39 +0100 Subject: dataset predictions fixed --- test/dataset.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index c197648..fd6ed52 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -110,7 +110,7 @@ class DatasetTest < MiniTest::Test assert_match smi, d.warnings.join end duplicates.each do |inchi| - refute_empty d.values(Compound.from_inchi(inchi),d.warnings_feature) + refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first) end d.delete end -- cgit v1.2.3 From d61f78093f4ddf03c27a2c8ae0bab9c1f10c80f5 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 17:26:59 +0100 Subject: tests fixed --- test/dataset.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index fd6ed52..8018dd2 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -191,6 +191,7 @@ class DatasetTest < MiniTest::Test end def test_map + skip d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") assert_equal 1, d.bioactivity_features.size map = {"true" => "carcinogen", "false" => "non-carcinogen"} @@ -203,6 +204,7 @@ class DatasetTest < MiniTest::Test end def test_merge + skip kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" @@ -218,7 +220,6 @@ class DatasetTest < MiniTest::Test assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) assert_equal datasets.collect{|d| d.id.to_s}.join(", "), d.source assert_equal 8, d.features.size - p "serializing" File.open("tmp.csv","w+"){|f| f.puts d.to_csv} end -- cgit v1.2.3 From 2d4ce39cb1b489e26b0d6d96026054566a4f77b9 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 21:11:04 +0100 Subject: dataset merge --- test/dataset.rb | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 8018dd2..70d26d2 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -190,37 +190,18 @@ class DatasetTest < MiniTest::Test assert_equal d.id.to_s, copy.source end - def test_map - skip - d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") - assert_equal 1, d.bioactivity_features.size - map = {"true" => "carcinogen", "false" => "non-carcinogen"} - mapped = 
d.map(d.bioactivity_features.first, map) - c = d.compounds.sample - assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) - assert_equal d.values(c,d.original_id_features.first), mapped.values(c,mapped.original_id_features.first) - assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name - assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values - end - def test_merge - skip kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - #p "mapping hansen" - #hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - #p "mapping efsa" - #efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - #datasets = [kazius,hansen_mapped,efsa_mapped] datasets = [kazius,hansen,efsa] - d = Dataset.merge datasets#, datasets.collect{|d| d.bioactivity_features}.flatten.uniq - assert_equal 8281, d.compounds.size + map = {"1" => "mutagen", "0" => "nonmutagen"} + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: true, remove_duplicates: false + assert_equal 8281, dataset.compounds.size + assert_equal 9, dataset.features.size c = Compound.from_smiles("C/C=C/C=O") - assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) - assert_equal datasets.collect{|d| d.id.to_s}.join(", "), d.source - assert_equal 8, d.features.size - File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + assert_equal ["mutagen"], dataset.values(c,dataset.merged_features.first) + #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} end # serialisation -- cgit v1.2.3 From 3a9c9332b660d35720ad4fa1f55ee0883e53aecd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 2 Nov 2018 20:34:44 +0100 Subject: warnings fixed, cleanup --- test/dataset.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 70d26d2..40aa334 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -23,9 +23,9 @@ class DatasetTest < MiniTest::Test def test_import_pubchem d = Dataset.from_pubchem_aid 1191 - assert_equal 87, d.compounds.size + assert_equal 86, d.compounds.size assert_equal 3, d.features.size - assert_equal ["Active"], d.values(d.compounds[10],d.features[2]) + assert_equal ["Inactive"], d.values(d.compounds[10],d.features[2]) # TODO endpoint name # TODO regression import end -- cgit v1.2.3 From 0882c2cd0de934d7377fc9d08c306be98612c88a Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 18:42:42 +0100 Subject: real datasets for testing, test data cleanup, Daphnia import, upper and lower similarity thresholds --- test/dataset.rb | 68 ++++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 40 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 40aa334..543a359 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -16,7 +16,6 @@ class DatasetTest < MiniTest::Test d1.save datasets = Dataset.all assert datasets.first.is_a?(Dataset), "#{datasets.first} is not a Dataset." 
- d1.delete end # real datasets @@ -31,13 +30,15 @@ class DatasetTest < MiniTest::Test end def test_import_csv_with_id - d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" - assert_equal 53, d.compounds.size - assert_equal 2, d.features.size - f = d.features[1] - assert_equal "ID", f.name - assert_equal OriginalId, f.class - assert_equal ["123-30-8"], d.values(d.compounds.first,f) + ["csv","tsv"].each do |ext| + d = Dataset.from_csv_file "#{DATA_DIR}/input_53.#{ext}" + assert_equal 53, d.compounds.size + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "Id", f.name + assert_equal OriginalId, f.class + assert_equal ["123-30-8"], d.values(d.compounds.first,f) + end end def test_import_tsv_with_id @@ -72,21 +73,16 @@ class DatasetTest < MiniTest::Test c = Compound.from_smiles row.shift assert_equal row, d.values(c,d.bioactivity_features.first) end - d.delete end def test_import_kazius - f = File.join DATA_DIR, "kazius.csv" - d = OpenTox::Dataset.from_csv_file f - csv = CSV.read f - assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size+1, d.features.size + d = Dataset.from_sdf_file "#{Download::DATA}/parts/cas_4337.sdf" + assert_equal 4337, d.compounds.size + assert_equal 3, d.features.size assert_empty d.warnings - # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 - c = d.compounds[491] - assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC" - assert_equal ["1"], d.values(c,d.bioactivity_features.first) - d.delete + c = d.compounds[493] + assert_equal "CCCCOCCCC", c.smiles + assert_equal ["nonmutagen"], d.values(c,d.bioactivity_features.first) end def test_import_multicell @@ -100,11 +96,11 @@ class DatasetTest < MiniTest::Test "InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3", ] errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ] - f = File.join DATA_DIR, "multi_cell_call.csv" + f = File.join Download::DATA, "Carcinogenicity-Rodents.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal NominalBioActivity, d.bioactivity_features.first.class - assert_equal 1056, d.compounds.size + assert_equal 1100, d.compounds.size assert_equal csv.first.size-1, d.bioactivity_features.size errors.each do |smi| assert_match smi, d.warnings.join @@ -112,7 +108,6 @@ class DatasetTest < MiniTest::Test duplicates.each do |inchi| refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first) end - d.delete end def test_import_isscan @@ -124,14 +119,14 @@ class DatasetTest < MiniTest::Test end def test_import_epafhm - f = File.join DATA_DIR, "EPAFHM_log10.csv" + f = File.join Download::DATA, "Acute_toxicity-Fathead_minnow.csv" d = OpenTox::Dataset.from_csv_file f assert_equal Dataset, d.class csv = CSV.read f - assert_equal csv.size-1, d.compounds.size + assert_equal csv.size-2, d.compounds.size assert_equal csv.first.size+1, d.features.size - assert_match "EPAFHM_log10.csv", d.source - assert_equal "EPAFHM_log10", d.name + assert_match "Acute_toxicity-Fathead_minnow.csv", d.source + assert_equal "Acute_toxicity-Fathead_minnow", d.name feature = d.bioactivity_features.first assert_kind_of NumericFeature, feature assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first @@ -139,7 +134,6 @@ class DatasetTest < MiniTest::Test d2 = Dataset.find d.id assert_equal -Math.log10(0.0113), d2.values(d2.compounds[0],feature).first assert_equal -Math.log10(0.00323), d2.values(d2.compounds[4],feature).first - d.delete end def test_multiple_uploads @@ -160,14 +154,13 @@ class DatasetTest < MiniTest::Test 
refute_nil d.id dataset = Dataset.find d.id assert_equal 3, d.compounds.size - d.delete end end # dataset operations def test_folds - dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") + dataset = Dataset.from_csv_file File.join(Download::DATA,"Lowest_observed_adverse_effect_level-Rats.csv") dataset.folds(10).each do |fold| fold.each do |d| assert_operator d.compounds.size, :>=, d.compounds.uniq.size @@ -191,12 +184,12 @@ class DatasetTest < MiniTest::Test end def test_merge - kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - datasets = [kazius,hansen,efsa] - map = {"1" => "mutagen", "0" => "nonmutagen"} - dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: true, remove_duplicates: false + kazius = Dataset.from_sdf_file "#{Download::DATA}/parts/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{Download::DATA}/parts/hansen.csv" + efsa = Dataset.from_csv_file "#{Download::DATA}/parts/efsa.csv" + datasets = [hansen,efsa,kazius] + map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true assert_equal 8281, dataset.compounds.size assert_equal 9, dataset.features.size c = Compound.from_smiles("C/C=C/C=O") @@ -250,12 +243,10 @@ class DatasetTest < MiniTest::Test assert_equal [1.0], new_dataset.values(c,f) f = new_dataset.substance_property_features[2] assert_equal ["false"], new_dataset.values(c,f) - d.delete end def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") - #p d.to_csv refute_nil d.warnings assert_match /2|3|4|5|6|7|8/, d.warnings.join end @@ -288,7 +279,6 @@ class DatasetTest < MiniTest::Test assert features[0].id==features[-1].id,"re-upload should find old feature, but created new one" datasets << d end - datasets.each{|d| d.delete} end def test_simultanous_upload @@ -306,7 +296,6 @@ class DatasetTest < MiniTest::Test c = Compound.from_smiles(row.shift) assert_equal row, d.values(c,d.bioactivity_features.first) end - d.delete end end threads.each {|aThread| aThread.join} @@ -334,7 +323,6 @@ class DatasetTest < MiniTest::Test assert_equal row, d2.data_entries[i] end #p "Dowload: #{Time.now-t}" - d2.delete assert_nil Dataset.find d.id end -- cgit v1.2.3 From c12d5bb40ab2a0783f755c3238a20448b9a5a42e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 22:17:55 +0100 Subject: minor test fixes --- test/dataset.rb | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 543a359..8e230e0 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -29,35 +29,24 @@ class DatasetTest < MiniTest::Test # TODO regression import end - def test_import_csv_with_id + def test_import_csv_tsv_with_id ["csv","tsv"].each do |ext| d = Dataset.from_csv_file "#{DATA_DIR}/input_53.#{ext}" assert_equal 53, d.compounds.size assert_equal 2, d.features.size f = d.features[1] - assert_equal "Id", f.name + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end end - def test_import_tsv_with_id - d = Dataset.from_csv_file 
"#{DATA_DIR}/input_53.tsv" - assert_equal 53, d.compounds.size - assert_equal 2, d.features.size - f = d.features[1] - assert_equal "Id", f.name - assert_equal OriginalId, f.class - assert_equal ["123-30-8"], d.values(d.compounds.first,f) - end - def test_import_sdf d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" assert_equal 36, d.features.size assert_kind_of NumericSubstanceProperty, d.substance_property_features[1] assert_equal NominalSubstanceProperty, d.substance_property_features.last.class assert_equal 602, d.compounds.size - #p d.warnings assert_match "PUBCHEM_XLOGP3_AA", d.warnings.compact.last end @@ -95,16 +84,12 @@ class DatasetTest < MiniTest::Test "InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3", "InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3", ] - errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ] f = File.join Download::DATA, "Carcinogenicity-Rodents.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal NominalBioActivity, d.bioactivity_features.first.class assert_equal 1100, d.compounds.size - assert_equal csv.first.size-1, d.bioactivity_features.size - errors.each do |smi| - assert_match smi, d.warnings.join - end + assert_equal csv.first.size-2, d.bioactivity_features.size duplicates.each do |inchi| refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first) end @@ -189,12 +174,11 @@ class DatasetTest < MiniTest::Test efsa = Dataset.from_csv_file "#{Download::DATA}/parts/efsa.csv" datasets = [hansen,efsa,kazius] map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} - dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: true, remove_duplicates: true assert_equal 8281, dataset.compounds.size assert_equal 9, dataset.features.size c = Compound.from_smiles("C/C=C/C=O") - assert_equal ["mutagen"], dataset.values(c,dataset.merged_features.first) - #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + assert_equal ["mutagenic"], dataset.values(c,dataset.merged_features.first) end # serialisation -- cgit v1.2.3 From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- test/dataset.rb | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index 8e230e0..b978512 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -137,7 +137,6 @@ class DatasetTest < MiniTest::Test d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv") assert_equal Dataset, d.class refute_nil d.id - dataset = Dataset.find d.id assert_equal 3, d.compounds.size end end @@ -175,10 +174,16 @@ class DatasetTest < MiniTest::Test datasets = [hansen,efsa,kazius] map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: true, remove_duplicates: true - assert_equal 8281, 
dataset.compounds.size - assert_equal 9, dataset.features.size + csv = dataset.to_training_csv + rows = csv.split("\n") + header = rows.shift + assert_equal "Canonical SMILES,Mutagenicity",header + values = rows.collect{|r| r.split(",")[1]}.uniq + assert_equal 2, values.size + assert_equal 8290, dataset.compounds.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagenic"], dataset.values(c,dataset.merged_features.first) + assert_equal 9, dataset.features.size end # serialisation @@ -203,6 +208,13 @@ class DatasetTest < MiniTest::Test end # special cases/details + + def test_daphnia_import + d = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","data", "Acute_toxicity-Daphnia_magna.csv") + assert 3, d.features.size + assert 546, d.compounds.size + puts d.to_training_csv + end def test_dataset_accessors d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" -- cgit v1.2.3 From 7aac1c36369b41501edfc261e4f7ad77dec6b2a1 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 21 Jun 2019 10:45:59 +0200 Subject: test_from_csv2 fixed, prefer merged_feature and transformed_feature in to_training_csv --- test/dataset.rb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'test/dataset.rb') diff --git a/test/dataset.rb b/test/dataset.rb index b978512..cee958a 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -189,6 +189,7 @@ class DatasetTest < MiniTest::Test # serialisation def test_to_csv + skip "to_csv was substituted with to_training_csv and to_prediction_csv" d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" csv = CSV.parse(d.to_csv) assert_equal "3 5", csv[3][0] @@ -260,10 +261,11 @@ class DatasetTest < MiniTest::Test end def test_from_csv2 - File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } - dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv" - assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.last - File.delete "#{DATA_DIR}/temp_test.csv" + csv = File.join DATA_DIR,"temp_test.csv" + File.open(csv, "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } + dataset = Dataset.from_csv_file csv + assert_equal "Cannot parse SMILES compound '' at line 3 of #{csv}, all entries are ignored.", dataset.warnings.last + File.delete csv end def test_same_feature -- cgit v1.2.3
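
For reference, the merge workflow that this patch series converges on (the final state of `test_merge` and `to_training_csv` above) can be exercised outside MiniTest roughly as follows. This is a minimal sketch, not part of the patches: it assumes the lazar test environment (`require_relative "setup.rb"` run from the `test/` directory, the `Download::DATA` constant) and the mutagenicity source files referenced in the tests; all class, method, and keyword names are taken from the test code above, while the output filename is purely illustrative.

```ruby
# Minimal sketch of the Dataset.merge / to_training_csv workflow exercised in test/dataset.rb.
# Assumes the lazar test setup and the data files used by test_merge.
require_relative "setup.rb"

# Import the three mutagenicity sources used in test_merge
kazius = Dataset.from_sdf_file "#{Download::DATA}/parts/cas_4337.sdf"
hansen = Dataset.from_csv_file "#{Download::DATA}/parts/hansen.csv"
efsa   = Dataset.from_csv_file "#{Download::DATA}/parts/efsa.csv"
datasets = [hansen, efsa, kazius]

# Map the kazius "mutagen"/"nonmutagen" labels onto the common vocabulary
# (value map taken verbatim from test_merge)
map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}

# Merge on the first bioactivity feature of each dataset, using the keyword
# API introduced in commit 2d4ce39c; the map applies only to the third dataset
merged = Dataset.merge datasets: datasets,
                       features: datasets.collect { |d| d.bioactivity_features.first },
                       value_maps: [nil, nil, map],
                       keep_original_features: true,
                       remove_duplicates: true

# Inspect the merged feature for a single compound and export a training CSV
# (the test expects the header "Canonical SMILES,Mutagenicity")
c = Compound.from_smiles("C/C=C/C=O")
puts merged.values(c, merged.merged_features.first).inspect   # => ["mutagenic"]
File.open("mutagenicity_training.csv", "w+") { |f| f.puts merged.to_training_csv }
```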