summaryrefslogtreecommitdiff
path: root/test/dataset.rb
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-05-06 12:49:28 +0200
committerChristoph Helma <helma@in-silico.ch>2016-05-06 12:49:28 +0200
commit51f57e2858b60bed74ebcc97189b2188c900c283 (patch)
treea3be50b410e45fad3f33e956bb302c66e0370226 /test/dataset.rb
parentab7b37541b4f8a762be737009631d3eefd898b4a (diff)
dataset tests cleanup
Diffstat (limited to 'test/dataset.rb')
-rw-r--r--test/dataset.rb364
1 files changed, 242 insertions, 122 deletions
diff --git a/test/dataset.rb b/test/dataset.rb
index a7b8769..f028dbe 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -4,6 +4,15 @@ require_relative "setup.rb"
class DatasetTest < MiniTest::Test
+ # basics
+
+ def test_create_empty
+ d = Dataset.new
+ assert_equal Dataset, d.class
+ refute_nil d.id
+ assert_kind_of BSON::ObjectId, d.id
+ end
+
def test_all
d1 = Dataset.new
d1.save
@@ -12,70 +21,160 @@ class DatasetTest < MiniTest::Test
d1.delete
end
+ # real datasets
+
+ def test_upload_hamster
+ d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ assert_equal Dataset, d.class
+ assert_equal 1, d.features.size
+ assert_equal 85, d.compounds.size
+ csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
+ csv.shift
+ csv.each do |row|
+ c = Compound.from_smiles row.shift
+ assert_equal c.toxicities[d.feature_ids.first.to_s], row
+ end
+ d.delete
+ end
+
+ def test_upload_kazius
+ f = File.join DATA_DIR, "kazius.csv"
+ d = OpenTox::Dataset.from_csv_file f
+ csv = CSV.read f
+ assert_equal csv.size-1, d.compounds.size
+ assert_equal csv.first.size-1, d.features.size
+ assert_empty d.warnings
+ # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
+ c = d.compounds[491]
+ assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC"
+ assert_equal c.toxicities[d.feature_ids.first.to_s][0], "1"
+ d.delete
+ end
+
+ def test_upload_multicell
+ duplicates = [
+ "InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H",
+ "InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2",
+ "InChI=1S/C2HCl3/c3-1-2(4)5/h1H",
+ "InChI=1S/C4H5Cl/c1-3-4(2)5/h3H,1-2H2",
+ "InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3",
+ "InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3",
+ "InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3",
+ ].collect{|inchi| Compound.from_inchi(inchi).smiles}
+ errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ]
+ f = File.join DATA_DIR, "multi_cell_call.csv"
+ d = OpenTox::Dataset.from_csv_file f
+ csv = CSV.read f
+ assert_equal true, d.features.first.nominal
+ assert_equal csv.size-1-errors.size, d.compounds.size
+ assert_equal csv.first.size-1, d.features.size
+ puts d.warnings.to_yaml
+ errors.each do |smi|
+ refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
+ end
+ duplicates.each do |smi|
+ refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
+ end
+ d.delete
+ end
+
+ def test_upload_isscan
+ f = File.join DATA_DIR, "ISSCAN-multi.csv"
+ d = OpenTox::Dataset.from_csv_file f
+ csv = CSV.read f
+ assert_equal csv.size-1, d.compounds.size
+ assert_equal csv.first.size-1, d.features.size
+ d.delete
+ end
+
+ def test_upload_epafhm
+ f = File.join DATA_DIR, "EPAFHM.csv"
+ d = OpenTox::Dataset.from_csv_file f
+ assert_equal Dataset, d.class
+ csv = CSV.read f
+ assert_equal csv.size-1, d.compounds.size
+ assert_equal csv.first.size-1, d.features.size
+ assert_match "EPAFHM.csv", d.source
+ assert_equal "EPAFHM", d.name
+ refute_nil d.warnings
+ assert_equal 74, d.warnings.size
+ feature = d.features.first
+ assert_kind_of NumericFeature, feature
+ assert_match /row 13/, d.warnings.join
+ assert_equal 0.0113, d.compounds.first.toxicities[feature.id.to_s].first
+ assert_equal 0.00323, d.compounds[5].toxicities[feature.id.to_s].first
+ d2 = Dataset.find d.id
+ assert_equal 0.0113, d2.compounds[0].toxicities[feature.id.to_s].first
+ assert_equal 0.00323, d2.compounds[5].toxicities[feature.id.to_s].first
+ d.delete
+ end
+
+ # batch predictions
+
def test_create_without_features_smiles_and_inchi
["smiles", "inchi"].each do |type|
d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv")
assert_equal Dataset, d.class
refute_nil d.id
dataset = Dataset.find d.id
- #p dataset.compounds
assert_equal 3, d.compounds.size.to_i
d.delete
end
end
- def test_create_empty
- d = Dataset.new
- assert_equal Dataset, d.class
- refute_nil d.id
- assert_kind_of BSON::ObjectId, d.id
+ # dataset operations
+
+ def test_folds
+ dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
+ dataset.folds(10).each do |fold|
+ fold.each do |d|
+ assert_operator d.compounds.size, :>=, d.compounds.uniq.size
+ end
+ assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
+ assert_equal dataset.substance_ids.size, fold.first.substance_ids.size + fold.last.substance_ids.size
+ assert_empty (fold.first.substance_ids & fold.last.substance_ids)
+ end
end
- def test_client_create
- d = Dataset.new
- assert_equal Dataset, d.class
- d.name = "Create dataset test"
+ # serialisation
- # add data entries
- features = ["test1", "test2"].collect do |title|
- f = Feature.new
- f.name = title
- f.numeric = true
- f.save
- f
+ def test_to_csv
+ d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
+ refute_nil d.warnings
+ assert d.warnings.grep(/Duplicate compound/)
+ assert d.warnings.grep(/3, 5/)
+ assert_equal 6, d.features.size
+ assert_equal 5, d.compounds.uniq.size
+ assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
+ csv = CSV.parse(d.to_csv)
+ original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
+ csv.shift
+ original_csv.shift
+ original = {}
+ original_csv.each do |row|
+ c = Compound.from_smiles row.shift.strip
+ original[c.inchi] = row.collect{|v| v.strip}
end
-
- # manual low-level insertions without consistency checks for runtime efficiency
- compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi|
- Compound.from_smiles smi
+ serialized = {}
+ csv.each do |row|
+ c = Compound.from_smiles row.shift
+ serialized[c.inchi] = row
end
- data_entries = []
- data_entries << [1,2]
- data_entries << [4,5]
- data_entries << [6,7]
- compounds.each_with_index do |c,i|
- features.each_with_index do |f,j|
- d.data_entries[c.id.to_s] ||= {}
- d.data_entries[c.id.to_s][f.id.to_s] ||= []
- d.data_entries[c.id.to_s][f.id.to_s] << data_entries[i][j]
+ original.each do |inchi,row|
+ row.each_with_index do |v,i|
+ if v.numeric?
+ assert_equal v.to_f, serialized[inchi][i].to_f
+ else
+ assert_equal v, serialized[inchi][i]
+ end
end
- end
- assert_equal 3, d.compounds.size
- assert_equal 2, d.features.size
- p d.data_entries
- assert_equal [[1,2],[4,5],[6,7]], d.data_entries
- d.save
- # check if dataset has been saved correctly
- new_dataset = Dataset.find d.id
- assert_equal 3, new_dataset.compounds.size
- assert_equal 2, new_dataset.features.size
- assert_equal [[1,2],[4,5],[6,7]], new_dataset.data_entries
- d.delete
- assert_nil Dataset.find d.id
- assert_nil Dataset.find new_dataset.id
+ end
+ d.delete
end
+ # special cases/details
+
def test_dataset_accessors
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
# create empty dataset
@@ -85,8 +184,8 @@ class DatasetTest < MiniTest::Test
assert_equal "multicolumn", new_dataset.name
# get features
assert_equal 6, new_dataset.features.size
- assert_equal 5, new_dataset.compounds.size
- de = new_dataset.data_entries[new_dataset.compounds.last.id.to_s]
+ assert_equal 5, new_dataset.compounds.uniq.size
+ de = new_dataset.compounds.last.toxicities
fid = new_dataset.features.first.id.to_s
assert_equal ["1"], de[fid]
fid = new_dataset.features.last.id.to_s
@@ -96,16 +195,6 @@ class DatasetTest < MiniTest::Test
d.delete
end
- def test_create_from_file
- d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
- assert_equal Dataset, d.class
- refute_nil d.warnings
- assert_match "EPAFHM.mini.csv", d.source
- assert_equal "EPAFHM.mini.csv", d.name
- d.delete
- #assert_equal false, URI.accessible?(d.uri)
- end
-
def test_create_from_file_with_wrong_smiles_compound_entries
d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
refute_nil d.warnings
@@ -113,56 +202,14 @@ class DatasetTest < MiniTest::Test
d.delete
end
- def test_multicolumn_csv
- d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
- refute_nil d.warnings
- assert d.warnings.grep(/Duplicate compound/)
- assert d.warnings.grep(/3, 5/)
- assert_equal 6, d.features.size
- assert_equal 5, d.compounds.size
- assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
- assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
- assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
- csv = CSV.parse(d.to_csv)
- original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
- csv.shift
- original_csv.shift
- csv.each_with_index do |row,i|
- compound = Compound.from_smiles row.shift
- original_compound = Compound.from_smiles original_csv[i].shift.strip
- assert_equal original_compound.inchi, compound.inchi
- row.each_with_index do |v,j|
- if v.numeric?
- assert_equal original_csv[i][j].strip.to_f, row[j].to_f
- else
- assert_equal original_csv[i][j].strip, row[j].to_s
- end
- end
- end
- d.delete
- end
-
- def test_from_csv
- d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- assert_equal Dataset, d.class
- assert_equal 1, d.features.size
- assert_equal 85, d.compounds.size
- assert_equal 85, d.data_entries.size
- csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
- csv.shift
- assert_equal csv.collect{|r| r[1]}, d.data_entries.flatten
- d.delete
- #assert_equal false, URI.accessible?(d.uri)
- end
-
def test_from_csv_classification
["int", "float", "string"].each do |mode|
d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv"
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.mini.bool_#{mode}.csv")
csv.shift
- entries = d.data_entries.flatten
- csv.each_with_index do |r, i|
- assert_equal r[1].to_s, entries[i]
+ csv.each do |row|
+ c = Compound.from_smiles row.shift
+ assert_equal c.toxicities[d.feature_ids.first.to_s], row
end
d.delete
end
@@ -189,32 +236,105 @@ class DatasetTest < MiniTest::Test
datasets.each{|d| d.delete}
end
- def test_create_from_file
- d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
- assert_equal Dataset, d.class
- refute_nil d.warnings
- assert_match /row 13/, d.warnings.join
- assert_match "EPAFHM.mini.csv", d.source
- assert_equal 1, d.features.size
- feature = d.features.first
- assert_kind_of NumericFeature, feature
- assert_equal 0.0113, d.data_entries[0][0]
- assert_equal 0.00323, d.data_entries[5][0]
+ # skips, may be removed in the future
+
+ def test_simultanous_upload
+ skip
+ threads = []
+ 3.times do |t|
+ threads << Thread.new(t) do |up|
+ d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ assert_equal OpenTox::Dataset, d.class
+ assert_equal 1, d.features.size
+ assert_equal 85, d.compounds.size
+ csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
+ csv.shift
+ csv.each do |row|
+ c = Compound.from_smiles(row.shift)
+ p row
+ p c.toxicities
+ p d.feature_ids.first.to_s
+ assert_equal row, c.toxicities[d.feature_ids.first.to_s]
+ end
+ d.delete
+ end
+ end
+ threads.each {|aThread| aThread.join}
+ end
+
+ def test_upload_feature_dataset
+ skip
+ t = Time.now
+ f = File.join DATA_DIR, "rat_feature_dataset.csv"
+ d = Dataset.from_csv_file f
+ assert_equal 458, d.features.size
+ d.save
+ #p "Upload: #{Time.now-t}"
d2 = Dataset.find d.id
- assert_equal 0.0113, d2.data_entries[0][0]
- assert_equal 0.00323, d2.data_entries[5][0]
+ t = Time.now
+ assert_equal d.features.size, d2.features.size
+ csv = CSV.read f
+ csv.shift # remove header
+ assert_empty d2.warnings
+ assert_equal csv.size, d2.compounds.size
+ assert_equal csv.first.size-1, d2.features.size
+ d2.compounds.each_with_index do |compound,i|
+ row = csv[i]
+ row.shift # remove compound
+ assert_equal row, d2.data_entries[i]
+ end
+    #p "Download: #{Time.now-t}"
+ d2.delete
+ assert_nil Dataset.find d.id
end
- def test_folds
- dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
- dataset.folds(10).each do |fold|
- fold.each do |d|
- assert_equal d.data_entries.size, d.compounds.size
- assert_equal d.compounds.size, :>=, d.compounds.uniq.size
+ def test_client_create
+ skip
+ d = Dataset.new
+ assert_equal Dataset, d.class
+ d.name = "Create dataset test"
+
+ # add data entries
+ features = ["test1", "test2"].collect do |title|
+ f = Feature.new
+ f.name = title
+ f.numeric = true
+ f.save
+ f
+ end
+
+ # manual low-level insertions without consistency checks for runtime efficiency
+ compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi|
+ Compound.from_smiles smi
+ end
+ data_entries = []
+ data_entries << [1,2]
+ data_entries << [4,5]
+ data_entries << [6,7]
+ compounds.each_with_index do |c,i|
+ features.each_with_index do |f,j|
+ d.substance_ids << c.id
+ d.feature_ids << f.id
+ c.toxicities[f.id.to_s] = data_entries[i][j]
end
- assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
end
- #puts dataset.folds 10
+
+ assert_equal 3, d.compounds.size
+ assert_equal 2, d.features.size
+ #assert_equal [[1,2],[4,5],[6,7]], d.data_entries
+ d.save
+ # check if dataset has been saved correctly
+ new_dataset = Dataset.find d.id
+ assert_equal 3, new_dataset.compounds.size
+ assert_equal 2, new_dataset.features.size
+ new_dataset.compounds.each_with_index do |c,i|
+ new_dataset.features.each_with_index do |f,j|
+ assert_equal data_entries[i][j], c.toxicities[f.id.to_s].first
+ end
+ end
+ d.delete
+ assert_nil Dataset.find d.id
+ assert_nil Dataset.find new_dataset.id
end
end