summary | refs | log | tree | commit | diff
path: root/test/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'test/dataset.rb')
-rw-r--r-- test/dataset.rb 233
1 files changed, 140 insertions, 93 deletions
diff --git a/test/dataset.rb b/test/dataset.rb
index 055a029..cee958a 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -16,40 +16,65 @@ class DatasetTest < MiniTest::Test
d1.save
datasets = Dataset.all
assert datasets.first.is_a?(Dataset), "#{datasets.first} is not a Dataset."
- d1.delete
end
# real datasets
+
+ # Builds a dataset with Dataset.from_pubchem_aid for id 1191 and checks the
+ # compound count, the feature count and one stored value.
+ def test_import_pubchem
+   imported = Dataset.from_pubchem_aid(1191)
+   assert_equal 86, imported.compounds.size
+   assert_equal 3, imported.features.size
+   compound = imported.compounds[10]
+   feature = imported.features[2]
+   assert_equal ["Inactive"], imported.values(compound, feature)
+   # TODO endpoint name
+   # TODO regression import
+ end
+
+ # Imports input_53 in both its csv and tsv form through Dataset.from_csv_file
+ # and checks the counts plus the name, class and first value of the second
+ # feature.
+ def test_import_csv_tsv_with_id
+   %w[csv tsv].each do |extension|
+     dataset = Dataset.from_csv_file("#{DATA_DIR}/input_53.#{extension}")
+     assert_equal 53, dataset.compounds.size
+     assert_equal 2, dataset.features.size
+     id_feature = dataset.features[1]
+     assert_equal "ID", id_feature.name
+     assert_equal OriginalId, id_feature.class
+     first_compound = dataset.compounds.first
+     assert_equal ["123-30-8"], dataset.values(first_compound, id_feature)
+   end
+ end
+
+ # Imports PA.sdf and checks the feature and compound counts, the classes of
+ # two substance property features and the last non-nil warning.
+ def test_import_sdf
+   sdf_dataset = Dataset.from_sdf_file("#{DATA_DIR}/PA.sdf")
+   assert_equal 36, sdf_dataset.features.size
+   second_property = sdf_dataset.substance_property_features[1]
+   assert_kind_of NumericSubstanceProperty, second_property
+   last_property = sdf_dataset.substance_property_features.last
+   assert_equal NominalSubstanceProperty, last_property.class
+   assert_equal 602, sdf_dataset.compounds.size
+   last_warning = sdf_dataset.warnings.compact.last
+   assert_match "PUBCHEM_XLOGP3_AA", last_warning
+ end
- def test_upload_hamster
+ def test_import_hamster
d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
assert_equal Dataset, d.class
- assert_equal 1, d.features.size
+ assert_equal 3, d.features.size
assert_equal 85, d.compounds.size
+ assert_equal NominalBioActivity, d.bioactivity_features.first.class
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
csv.shift
csv.each do |row|
c = Compound.from_smiles row.shift
- assert_equal row, d.values(c,d.features.first)
+ assert_equal row, d.values(c,d.bioactivity_features.first)
end
- d.delete
end
- def test_upload_kazius
- f = File.join DATA_DIR, "kazius.csv"
- d = OpenTox::Dataset.from_csv_file f
- csv = CSV.read f
- assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
+ def test_import_kazius
+ d = Dataset.from_sdf_file "#{Download::DATA}/parts/cas_4337.sdf"
+ assert_equal 4337, d.compounds.size
+ assert_equal 3, d.features.size
assert_empty d.warnings
- # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
- c = d.compounds[491]
- assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC"
- assert_equal ["1"], d.values(c,d.features.first)
- d.delete
+ c = d.compounds[493]
+ assert_equal "CCCCOCCCC", c.smiles
+ assert_equal ["nonmutagen"], d.values(c,d.bioactivity_features.first)
end
- def test_upload_multicell
+ def test_import_multicell
duplicates = [
"InChI=1S/C6HCl5O/c7-1-2(8)4(10)6(12)5(11)3(1)9/h12H",
"InChI=1S/C12H8Cl6O/c13-8-9(14)11(16)5-3-1-2(6-7(3)19-6)4(5)10(8,15)12(11,17)18/h2-7H,1H2",
@@ -58,143 +83,169 @@ class DatasetTest < MiniTest::Test
"InChI=1S/C4H7Cl/c1-4(2)3-5/h1,3H2,2H3",
"InChI=1S/C8H14O4/c1-5-4-8(11-6(2)9)12-7(3)10-5/h5,7-8H,4H2,1-3H3",
"InChI=1S/C19H30O5/c1-3-5-7-20-8-9-21-10-11-22-14-17-13-19-18(23-15-24-19)12-16(17)6-4-2/h12-13H,3-11,14-15H2,1-2H3",
- ].collect{|inchi| Compound.from_inchi(inchi).smiles}
- errors = ['O=P(H)(OC)OC', 'C=CCNN.HCl' ]
- f = File.join DATA_DIR, "multi_cell_call.csv"
+ ]
+ f = File.join Download::DATA, "Carcinogenicity-Rodents.csv"
d = OpenTox::Dataset.from_csv_file f
csv = CSV.read f
- assert_equal true, d.features.first.nominal?
- assert_equal 1056, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
- errors.each do |smi|
- refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
- end
- duplicates.each do |smi|
- refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
+ assert_equal NominalBioActivity, d.bioactivity_features.first.class
+ assert_equal 1100, d.compounds.size
+ assert_equal csv.first.size-2, d.bioactivity_features.size
+ duplicates.each do |inchi|
+ refute_empty d.values(Compound.from_inchi(inchi),d.warnings_features.first)
end
- d.delete
end
- def test_upload_isscan
+ def test_import_isscan
f = File.join DATA_DIR, "ISSCAN-multi.csv"
d = OpenTox::Dataset.from_csv_file f
csv = CSV.read f
assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
- d.delete
+ assert_equal csv.first.size+1, d.features.size
end
- def test_upload_epafhm
- f = File.join DATA_DIR, "EPAFHM_log10.csv"
+ def test_import_epafhm
+ f = File.join Download::DATA, "Acute_toxicity-Fathead_minnow.csv"
d = OpenTox::Dataset.from_csv_file f
assert_equal Dataset, d.class
csv = CSV.read f
- assert_equal csv.size-1, d.compounds.size
- assert_equal csv.first.size-1, d.features.size
- assert_match "EPAFHM_log10.csv", d.source
- assert_equal "EPAFHM_log10", d.name
- feature = d.features.first
+ assert_equal csv.size-2, d.compounds.size
+ assert_equal csv.first.size+1, d.features.size
+ assert_match "Acute_toxicity-Fathead_minnow.csv", d.source
+ assert_equal "Acute_toxicity-Fathead_minnow", d.name
+ feature = d.bioactivity_features.first
assert_kind_of NumericFeature, feature
assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first
assert_equal -Math.log10(0.00323), d.values(d.compounds[4],feature).first
d2 = Dataset.find d.id
assert_equal -Math.log10(0.0113), d2.values(d2.compounds[0],feature).first
assert_equal -Math.log10(0.00323), d2.values(d2.compounds[4],feature).first
- d.delete
+ end
+
+ # Imports the same CSV file twice and expects both results to compare equal.
+ def test_multiple_uploads
+   first_import, second_import = Array.new(2) do
+     Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv")
+   end
+   assert_equal first_import, second_import
end
# batch predictions
def test_create_without_features_smiles_and_inchi
["smiles", "inchi"].each do |type|
- d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true
+ d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv")
assert_equal Dataset, d.class
refute_nil d.id
- dataset = Dataset.find d.id
assert_equal 3, d.compounds.size
- d.delete
end
end
# dataset operations
def test_folds
- dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
+ dataset = Dataset.from_csv_file File.join(Download::DATA,"Lowest_observed_adverse_effect_level-Rats.csv")
dataset.folds(10).each do |fold|
fold.each do |d|
assert_operator d.compounds.size, :>=, d.compounds.uniq.size
end
+ refute_empty fold[0].compounds
+ refute_empty fold[1].compounds
+ refute_empty fold[0].data_entries
+ refute_empty fold[1].data_entries
assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size
assert_empty (fold.first.substances & fold.last.substances)
end
end
+ # Copies an imported dataset and checks that data entries and name match the
+ # original and that the copy's source equals the original's id as a string.
+ def test_copy
+   original = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+   duplicate = original.copy
+   assert_equal original.data_entries, duplicate.data_entries
+   assert_equal original.name, duplicate.name
+   assert_equal original.id.to_s, duplicate.source
+ end
+
+ # Merges the hansen, efsa and kazius datasets on their first bioactivity
+ # feature and checks the training CSV header, the number of distinct values,
+ # the compound count, one merged value and the total feature count.
+ def test_merge
+   kazius = Dataset.from_sdf_file "#{Download::DATA}/parts/cas_4337.sdf"
+   hansen = Dataset.from_csv_file "#{Download::DATA}/parts/hansen.csv"
+   efsa = Dataset.from_csv_file "#{Download::DATA}/parts/efsa.csv"
+   sources = [hansen, efsa, kazius]
+   # Only the third dataset (kazius) is given a value map; the others get nil.
+   kazius_map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
+   merged = Dataset.merge(
+     datasets: sources,
+     features: sources.map { |source| source.bioactivity_features.first },
+     value_maps: [nil, nil, kazius_map],
+     keep_original_features: true,
+     remove_duplicates: true
+   )
+   header, *rows = merged.to_training_csv.split("\n")
+   assert_equal "Canonical SMILES,Mutagenicity", header
+   distinct_values = rows.map { |row| row.split(",")[1] }.uniq
+   assert_equal 2, distinct_values.size
+   assert_equal 8290, merged.compounds.size
+   probe = Compound.from_smiles("C/C=C/C=O")
+   assert_equal ["mutagenic"], merged.values(probe, merged.merged_features.first)
+   assert_equal 9, merged.features.size
+ end
+
# serialisation
def test_to_csv
+ skip "to_csv was substituted with to_training_csv and to_prediction_csv"
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
- refute_nil d.warnings
- assert d.warnings.grep(/Duplicate compound/)
- assert d.warnings.grep(/3, 5/)
- assert_equal 6, d.features.size
- assert_equal 5, d.compounds.uniq.size
- assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
csv = CSV.parse(d.to_csv)
- original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv")
- csv.shift
- original_csv.shift
- original = {}
- original_csv.each do |row|
- c = Compound.from_smiles row.shift.strip
- original[c.inchi] = row.collect{|v| v.strip}
- end
- serialized = {}
- csv.each do |row|
- c = Compound.from_smiles row.shift
- serialized[c.inchi] = row
- end
- #puts serialized.to_yaml
- original.each do |inchi,row|
- row.each_with_index do |v,i|
- if v.numeric?
- assert_equal v.to_f, serialized[inchi][i].to_f
- else
- assert_equal v.to_s, serialized[inchi][i].to_s
- end
- end
+ assert_equal "3 5", csv[3][0]
+ assert_match "3, 5", csv[3][9]
+ assert_match "Duplicate", csv[3][9]
+ assert_equal '7,c1nccc1,[N]1C=CC=C1,1,,false,,,1.0,', csv[5].join(",")
+ end
+ def test_to_sdf
+ d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.mini.csv"
+ File.open("#{DATA_DIR}/tmp.sdf","w+") do |f|
+ f.puts d.to_sdf
end
- d.delete
+ d2 = Dataset.from_sdf_file "#{DATA_DIR}/tmp.sdf"
+ assert_equal d.compounds.size, d2.compounds.size
+ `rm #{DATA_DIR}/tmp.sdf`
end
# special cases/details
+
+ # Imports Acute_toxicity-Daphnia_magna.csv from the data directory one level
+ # above this file's directory and checks the feature and compound counts.
+ def test_daphnia_import
+   d = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","data", "Acute_toxicity-Daphnia_magna.csv")
+   # Use assert_equal: `assert 3, x` passes for every x, because the second
+   # argument of assert is only the failure message.
+   assert_equal 3, d.features.size
+   assert_equal 546, d.compounds.size
+   # Exercise the training CSV export without dumping it into the test output.
+   refute_empty d.to_training_csv
+ end
def test_dataset_accessors
d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
+ refute_nil d.warnings
+ assert d.warnings.grep(/Duplicate compound/)
+ assert d.warnings.grep(/3, 5/)
+ assert_equal 9, d.features.size
+ assert_equal 5, d.compounds.uniq.size
+ assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
# create empty dataset
new_dataset = Dataset.find d.id
# get metadata
assert_match "multicolumn.csv", new_dataset.source
assert_equal "multicolumn", new_dataset.name
# get features
- assert_equal 6, new_dataset.features.size
+ assert_equal 9, new_dataset.features.size
assert_equal 5, new_dataset.compounds.uniq.size
c = new_dataset.compounds.last
- f = new_dataset.features.first
+ f = new_dataset.substance_property_features.first
assert_equal ["1"], new_dataset.values(c,f)
- f = new_dataset.features.last.id.to_s
+ f = new_dataset.substance_property_features.last.id
assert_equal [1.0], new_dataset.values(c,f)
- f = new_dataset.features[2]
+ f = new_dataset.substance_property_features[2]
assert_equal ["false"], new_dataset.values(c,f)
- d.delete
end
def test_create_from_file_with_wrong_smiles_compound_entries
d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv")
refute_nil d.warnings
assert_match /2|3|4|5|6|7|8/, d.warnings.join
- d.delete
end
def test_from_csv_classification
@@ -204,19 +255,17 @@ class DatasetTest < MiniTest::Test
csv.shift
csv.each do |row|
c = Compound.from_smiles row.shift
- assert_equal row, d.values(c,d.features.first)
+ assert_equal row, d.values(c,d.bioactivity_features.first)
end
- d.delete
end
end
def test_from_csv2
- File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
- dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
- assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join
- File.delete "#{DATA_DIR}/temp_test.csv"
- dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
- dataset.delete
+ csv = File.join DATA_DIR,"temp_test.csv"
+ File.open(csv, "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
+ dataset = Dataset.from_csv_file csv
+ assert_equal "Cannot parse SMILES compound '' at line 3 of #{csv}, all entries are ignored.", dataset.warnings.last
+ File.delete csv
end
def test_same_feature
@@ -228,24 +277,23 @@ class DatasetTest < MiniTest::Test
assert features[0].id==features[-1].id,"re-upload should find old feature, but created new one"
datasets << d
end
- datasets.each{|d| d.delete}
end
def test_simultanous_upload
+ skip
threads = []
3.times do |t|
threads << Thread.new(t) do |up|
- d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+ d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
assert_equal OpenTox::Dataset, d.class
- assert_equal 1, d.features.size
+ assert_equal 3, d.features.size
assert_equal 85, d.compounds.size
csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv")
csv.shift
csv.each do |row|
c = Compound.from_smiles(row.shift)
- assert_equal row, d.values(c,d.features.first)
+ assert_equal row, d.values(c,d.bioactivity_features.first)
end
- d.delete
end
end
threads.each {|aThread| aThread.join}
@@ -273,7 +321,6 @@ class DatasetTest < MiniTest::Test
assert_equal row, d2.data_entries[i]
end
#p "Dowload: #{Time.now-t}"
- d2.delete
assert_nil Dataset.find d.id
end