summaryrefslogtreecommitdiff
path: root/test/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'test/dataset.rb')
-rw-r--r--test/dataset.rb89
1 files changed, 57 insertions, 32 deletions
diff --git a/test/dataset.rb b/test/dataset.rb
index 2b439bb..163f178 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -26,8 +26,8 @@ class DatasetTest < MiniTest::Test
def test_import_pubchem
d = Dataset.from_pubchem_aid 1191
assert_equal 87, d.compounds.size
- assert_equal 2, d.features.size
- assert_equal ["Active"], d.values(d.compounds[10],d.features[1])
+ assert_equal 3, d.features.size
+ assert_equal ["Active"], d.values(d.compounds[10],d.features[2])
# TODO endpoint name
# TODO regression import
end
@@ -35,9 +35,9 @@ class DatasetTest < MiniTest::Test
def test_import_csv_with_id
d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv"
assert_equal 53, d.compounds.size
- assert_equal 1, d.features.size
- f = d.features[0]
- assert_equal "input_53.ID", f.name
+ assert_equal 2, d.features.size
+ f = d.features[1]
+ assert_equal "ID", f.name
assert_equal OriginalId, f.class
assert_equal ["123-30-8"], d.values(d.compounds.first,f)
end
@@ -45,16 +45,16 @@ class DatasetTest < MiniTest::Test
def test_import_tsv_with_id
d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv"
assert_equal 53, d.compounds.size
- assert_equal 1, d.features.size
- f = d.features[0]
- assert_equal "input_53.ID", f.name
+ assert_equal 2, d.features.size
+ f = d.features[1]
+ assert_equal "ID", f.name
assert_equal OriginalId, f.class
assert_equal ["123-30-8"], d.values(d.compounds.first,f)
end
def test_import_sdf
d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf"
- assert_equal 35, d.features.size
+ assert_equal 37, d.features.size
assert_kind_of NumericSubstanceProperty, d.features[1]
assert_equal NominalSubstanceProperty, d.features.last.class
assert_equal 602, d.compounds.size
@@ -64,7 +64,7 @@ class DatasetTest < MiniTest::Test
def test_import_hamster
d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
assert_equal Dataset, d.class
- assert_equal 1, d.features.size
+ assert_equal 3, d.features.size
assert_equal 85, d.compounds.size
assert_equal NominalBioActivity, d.features.first.class
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
@@ -81,7 +81,7 @@ class DatasetTest < MiniTest::Test
d = OpenTox::Dataset.from_csv_file f
csv = CSV.read f
assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
+ assert_equal csv.first.size+1, d.features.size
assert_empty d.warnings
# 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
c = d.compounds[491]
@@ -121,8 +121,9 @@ class DatasetTest < MiniTest::Test
d = OpenTox::Dataset.from_csv_file f
csv = CSV.read f
assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
- d.delete
+ assert_equal csv.first.size+1, d.features.size
+ # TODO fix csv output (headers, column order)
+ puts d.to_csv
end
def test_import_epafhm
@@ -131,7 +132,7 @@ class DatasetTest < MiniTest::Test
assert_equal Dataset, d.class
csv = CSV.read f
assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
+ assert_equal csv.first.size+1, d.features.size
assert_match "EPAFHM_log10.csv", d.source
assert_equal "EPAFHM_log10", d.name
feature = d.features.first
@@ -168,23 +169,6 @@ class DatasetTest < MiniTest::Test
# dataset operations
- def test_merge
- skip # TODO use new Features
- source_feature = Feature.where(:name => "Ames test categorisation").first
- target_feature = Feature.where(:name => "Mutagenicity").first
- kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf"
- hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv"
- efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv"
- d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"}
- #File.open("tmp.csv","w+"){|f| f.puts d.to_csv}
- assert_equal 8281, d.compounds.size
- c = Compound.from_smiles("C/C=C/C=O")
- assert_equal ["mutagen"], d.values(c,target_feature)
- assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source
- p d.features
- assert_equal 4, d.features.size
- end
-
def test_folds
dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
dataset.folds(10).each do |fold|
@@ -197,10 +181,48 @@ class DatasetTest < MiniTest::Test
end
end
+ def test_copy
+ d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv")
+ copy = d.copy
+ assert_equal d.data_entries, copy.data_entries
+ assert_equal d.name, copy.name
+ assert_equal d.id.to_s, copy.source
+ end
+
+ def test_map
+ d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv")
+ assert_equal 1, d.bioactivity_features.size
+ map = {"true" => "carcinogen", "false" => "non-carcinogen"}
+ mapped = d.map(d.bioactivity_features.first, map)
+ c = d.compounds.sample
+ assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first)
+ assert_equal d.original_id(c), mapped.original_id(c)
+ assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name
+ assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values
+ end
+
+ def test_merge
+ kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf"
+ hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv"
+ efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv"
+ hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"}
+ efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"}
+ datasets = [kazius,hansen_mapped,efsa_mapped]
+ d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq
+ File.open("tmp.csv","w+"){|f| f.puts d.to_csv}
+ assert_equal 8281, d.compounds.size
+ c = Compound.from_smiles("C/C=C/C=O")
+ assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first)
+ assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source
+ p d.features
+ assert_equal 4, d.features.size
+ end
+
# serialisation
def test_to_csv
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
+ # TODO warnings
refute_nil d.warnings
assert d.warnings.grep(/Duplicate compound/)
assert d.warnings.grep(/3, 5/)
@@ -268,6 +290,7 @@ class DatasetTest < MiniTest::Test
def test_create_from_file_with_wrong_smiles_compound_entries
d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
+ p d.to_csv
refute_nil d.warnings
assert_match /2|3|4|5|6|7|8/, d.warnings.join
d.delete
@@ -289,6 +312,8 @@ class DatasetTest < MiniTest::Test
def test_from_csv2
File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
+ p dataset
+ p dataset.to_csv
assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join
File.delete "#{DATA_DIR}/temp_test.csv"
dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
@@ -313,7 +338,7 @@ class DatasetTest < MiniTest::Test
threads << Thread.new(t) do |up|
d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
assert_equal OpenTox::Dataset, d.class
- assert_equal 1, d.features.size
+ assert_equal 3, d.features.size
assert_equal 85, d.compounds.size
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
csv.shift