From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- test/classification-model.rb | 47 ++++++++++++----------- test/dataset.rb | 89 ++++++++++++++++++++++++++++---------------- 2 files changed, 82 insertions(+), 54 deletions(-) (limited to 'test') diff --git a/test/classification-model.rb b/test/classification-model.rb index b94b5e6..7a2a64f 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -22,37 +22,40 @@ class LazarClassificationTest < MiniTest::Test assert_kind_of Model::LazarClassification, model assert_equal algorithms, model.algorithms [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), :prediction => "false", },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), - :prediction => "false", + :compound => OpenTox::Compound.from_smiles("O=CNc1scc(n1)c1ccc(o1)[N+](=O)[O-]"), + :prediction => "true", } ].each do |example| prediction = model.predict example[:compound] - p example[:compound] - p prediction - #assert_equal example[:prediction], prediction[:value] + assert_equal example[:prediction], prediction[:value] end - compound = Compound.from_smiles "CCO" - prediction = model.predict compound - assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") prediction_dataset = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction_dataset.compounds + puts prediction_dataset.to_csv + assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size + c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O" + prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0] + assert_equal ["true"], prediction_dataset.values(c, prediction_feature) + p_true = LazarPredictionProbability.find_by(:name => "true") + p_false = LazarPredictionProbability.find_by(:name => "false") + p p_true + assert_equal [0.7], prediction_dataset.values(c,p_true) + assert_equal [0.0], prediction_dataset.values(c,p_false) + assert_equal 0.0, p_false - cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] - expectations = ["Cannot create prediction: Only one similar compound in the training set.", - "Could not find similar substances with experimental data in the training dataset."] - prediction_dataset.predictions.each do |cid,pred| - assert_includes expectations, pred[:warnings][0] if pred[:value].nil? - end - cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:info] +# cid = prediction_dataset.compounds[7].id.to_s +# assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] +# expectations = ["Cannot create prediction: Only one similar compound in the training set.", +# "Could not find similar substances with experimental data in the training dataset."] +# prediction_dataset.predictions.each do |cid,pred| +# assert_includes expectations, pred[:warnings][0] if pred[:value].nil? +# end +# cid = Compound.from_smiles("CCOC(=O)N").id.to_s +# assert_match "excluded", prediction_dataset.predictions[cid][:info] end def test_classification_parameters diff --git a/test/dataset.rb b/test/dataset.rb index 2b439bb..163f178 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -26,8 +26,8 @@ class DatasetTest < MiniTest::Test def test_import_pubchem d = Dataset.from_pubchem_aid 1191 assert_equal 87, d.compounds.size - assert_equal 2, d.features.size - assert_equal ["Active"], d.values(d.compounds[10],d.features[1]) + assert_equal 3, d.features.size + assert_equal ["Active"], d.values(d.compounds[10],d.features[2]) # TODO endpoint name # TODO regression import end @@ -35,9 +35,9 @@ class DatasetTest < MiniTest::Test def test_import_csv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end @@ -45,16 +45,16 @@ class DatasetTest < MiniTest::Test def test_import_tsv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal 35, d.features.size + assert_equal 37, d.features.size assert_kind_of NumericSubstanceProperty, d.features[1] assert_equal NominalSubstanceProperty, d.features.last.class assert_equal 602, d.compounds.size @@ -64,7 +64,7 @@ class DatasetTest < MiniTest::Test def test_import_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size assert_equal NominalBioActivity, d.features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") @@ -81,7 +81,7 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_empty d.warnings # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] @@ -121,8 +121,9 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size - d.delete + assert_equal csv.first.size+1, d.features.size + # TODO fix csv output (headers, column order) + puts d.to_csv end def test_import_epafhm @@ -131,7 +132,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name feature = d.features.first @@ -168,23 +169,6 @@ class DatasetTest < MiniTest::Test # dataset operations - def test_merge - skip # TODO use new Features - source_feature = Feature.where(:name => "Ames test categorisation").first - target_feature = Feature.where(:name => "Mutagenicity").first - kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} - #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} - assert_equal 8281, d.compounds.size - c = Compound.from_smiles("C/C=C/C=O") - assert_equal ["mutagen"], d.values(c,target_feature) - assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - p d.features - assert_equal 4, d.features.size - end - def test_folds dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") dataset.folds(10).each do |fold| @@ -197,10 +181,48 @@ class DatasetTest < MiniTest::Test end end + def test_copy + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + copy = d.copy + assert_equal d.data_entries, copy.data_entries + assert_equal d.name, copy.name + assert_equal d.id.to_s, copy.source + end + + def test_map + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + assert_equal 1, d.bioactivity_features.size + map = {"true" => "carcinogen", "false" => "non-carcinogen"} + mapped = d.map(d.bioactivity_features.first, map) + c = d.compounds.sample + assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) + assert_equal d.original_id(c), mapped.original_id(c) + assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name + assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values + end + + def test_merge + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + datasets = [kazius,hansen_mapped,efsa_mapped] + d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq + File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + assert_equal 8281, d.compounds.size + c = Compound.from_smiles("C/C=C/C=O") + assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) + assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + p d.features + assert_equal 4, d.features.size + end + # serialisation def test_to_csv d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" + # TODO warnings refute_nil d.warnings assert d.warnings.grep(/Duplicate compound/) assert d.warnings.grep(/3, 5/) @@ -268,6 +290,7 @@ class DatasetTest < MiniTest::Test def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") + p d.to_csv refute_nil d.warnings assert_match /2|3|4|5|6|7|8/, d.warnings.join d.delete @@ -289,6 +312,8 @@ class DatasetTest < MiniTest::Test def test_from_csv2 File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv" + p dataset + p dataset.to_csv assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join File.delete "#{DATA_DIR}/temp_test.csv" dataset.features.each{|f| feature = Feature.find f.id; feature.delete} @@ -313,7 +338,7 @@ class DatasetTest < MiniTest::Test threads << Thread.new(t) do |up| d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal OpenTox::Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift -- cgit v1.2.3