diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-11-21 12:27:40 +0100 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-11-21 12:27:40 +0100 |
commit | fd73098fc09aac004de5e22b4400861f1ad29971 (patch) | |
tree | 77d66cfa2548e40fe1810316eeec264aadac84e0 | |
parent | 232a0e6a9613b3b03c7ab93a2536ff2f1219064a (diff) |
update tests for changes in dataset, r-util and validation
-rw-r--r-- | Rakefile | 2 | ||||
-rw-r--r-- | data/hamster_carcinogenicity.mini.dup.csv | 12 | ||||
-rw-r--r-- | dataset.rb | 56 | ||||
-rw-r--r-- | r-util.rb | 52 | ||||
-rw-r--r-- | validation.rb | 72 |
5 files changed, 107 insertions, 87 deletions
@@ -21,6 +21,7 @@ end task :setup do @@subjectid = OpenTox::Authorization.authenticate(TEST_USER,TEST_PW) @@classification_training_dataset = OpenTox::Dataset.create_from_csv_file("data/hamster_carcinogenicity.csv", @@subjectid) + @@duplicate_dataset = OpenTox::Dataset.create_from_csv_file("data/hamster_carcinogenicity.mini.dup.csv", @@subjectid) @@multinomial_training_dataset = OpenTox::Dataset.create_from_csv_file("data/ISSCAN-multi.csv", @@subjectid) @@regression_training_dataset = OpenTox::Dataset.create_from_csv_file("data/EPAFHM.csv", @@subjectid) @@regression_feature_dataset = OpenTox::Dataset.create_from_csv_file("data/EPAFHM-constitutional.csv", @@subjectid) @@ -28,6 +29,7 @@ end task :teardown do @@classification_training_dataset.delete(@@subjectid) + @@duplicate_dataset.delete(@@subjectid) @@multinomial_training_dataset.delete(@@subjectid) @@regression_training_dataset.delete(@@subjectid) @@regression_feature_dataset.delete(@@subjectid) diff --git a/data/hamster_carcinogenicity.mini.dup.csv b/data/hamster_carcinogenicity.mini.dup.csv new file mode 100644 index 0000000..3982ff6 --- /dev/null +++ b/data/hamster_carcinogenicity.mini.dup.csv @@ -0,0 +1,12 @@ +SMILES,Hamster-Carcinogenicity,Missing +CC=O,1, +C12C3=C(C=CC=C3)CC1=CC(=CC=2)NC(C)=O,1,a +O=C(N)\C(C2=CC=CO2)=C/C1=CC=C([N+]([O-])=O)O1,1,a +C1(N=CNN=1)N,0,b +Br(=O)(=O)[O-].[K+],1,a +[Cl-].[Cd+2].[Cl-],0,b +O=S(=O)([O-])[O-].[Cd+2],0,b +ClC1=CC(=NC(=N1)SCC(=O)O)NC2=CC=CC(=C2C)C,0,a +ClCOC,1,a +C=C(Cl)C=C,0, +C=C(Cl)C=C,1,a @@ -219,45 +219,23 @@ class DatasetTest < Test::Unit::TestCase end end - - def test_merge() - #upload - dataset1 = OpenTox::Dataset.create_from_csv_file(File.new("data/hamster_carcinogenicity.csv").path, @@subjectid) - dataset2 = OpenTox::Dataset.create_from_csv_file(File.new("data/multi_cell_call.csv").path, @@subjectid) - #merge1 - title = "test merge" - dataset_merge1 = OpenTox::Dataset.merge(dataset1, dataset2, { DC.title => title,DC.creator => "testsuite"}, @@subjectid ) - 
dataset_reloaded1 = OpenTox::Dataset.find(dataset_merge1.uri, @@subjectid) - #test1 - [dataset_merge1, dataset_reloaded1].each do |d| - assert_equal d.metadata[DC.title],title - assert_equal d.features.size,(dataset1.features.size+dataset2.features.size) - assert_equal d.compounds.size,(dataset1.compounds+dataset2.compounds).uniq.size - [dataset1, dataset2].each do |d_i| - d_i.compounds.each{|c| assert d.compounds.include?(c)} - d_i.features.keys.each{|f| assert d.features.keys.include?(f)} - d_i.features.keys.each do |f| - assert_equal d_i.features[f],d.features[f] - d_i.compounds do |c| - assert_equal d_i.data_entries[c][f],d.data_entries[c][f] - end - end - end - end - #merge2 - compounds1 = dataset1.compounds[0..dataset1.compounds.size/2] - features1 = [] - dataset_merge2 = OpenTox::Dataset.merge(dataset1, dataset2, {}, @@subjectid, features1, nil, compounds1 ) - dataset_reloaded2 = OpenTox::Dataset.find(dataset_merge2.uri, @@subjectid) - #test2 - [dataset_merge2, dataset_reloaded2].each do |d| - assert_equal d.features.size,dataset2.features.size - assert_equal d.compounds.size,(compounds1+dataset2.compounds).uniq.size - end - #cleanup - [dataset_merge1, dataset_merge2, dataset1, dataset2].each do |d| - OpenTox::RestClientWrapper.delete(d.uri,{:subjectid => @@subjectid}) - end + def test_split() + puts "test_split" + d = @@duplicate_dataset + assert_equal(d.compounds.size,11) + assert_equal(d.compounds.uniq.size,10) + assert_equal(d.compounds[9],d.compounds[10]) + dupl = d.compounds[9] + feat1 = d.features.keys.sort[0] + feat2 = d.features.keys.sort[1] + assert_not_equal(d.data_entries[dupl][feat1][0],d.data_entries[dupl][feat1][1]) + assert_equal(d.data_entries[dupl][feat2][0],nil) + d2 = d.split([0,1,2,3,4,5,6,7,8,10],d.features.keys,d.metadata) + assert_equal(d2.compounds.size,10) + assert_equal(d2.compounds.uniq.size,10) + assert_equal(d.data_entries[dupl][feat1][1],d2.data_entries[dupl][feat1][0]) + assert_equal(d2.data_entries[dupl][feat2][0],"a") + 
d2.delete end def test_multithreading @@ -146,14 +146,9 @@ class RUtilTest < Test::Unit::TestCase def stratified_split unless defined?@@strat @@split_ratio = 0.05 - @@split_has_duplicates = false #hamster has no duplicates -# res = @@rutil.stratified_split(@@hamster,0,@@split_ratio,1) -# @@resources += [ res[0].uri, res[1].uri ] -# @@strat = { :data => @@hamster, :split1 => res[0], :split2 => res[1] } - data_combined = OpenTox::Dataset.merge(@@hamster,@@hamster_features,{},@@subjectid) - res1, res2 = @@rutil.stratified_split(data_combined,{},0,@@split_ratio,@@subjectid,1) - @@resources += [ data_combined.uri, res1.uri, res2.uri ] - @@strat = {:data => data_combined, :split1 => res1, :split2 => res2 } + res1, res2 = @@rutil.stratified_split(@@hamster_features,{},0,@@split_ratio,@@subjectid,1) + @@resources += [ res1.uri, res2.uri ] + @@strat = {:data => @@hamster_features, :split1 => res1, :split2 => res2 } end @@strat end @@ -165,31 +160,22 @@ class RUtilTest < Test::Unit::TestCase size1 = split[:split1].compounds.size size2 = split[:split2].compounds.size assert_equal size,(split[:split1].compounds+split[:split2].compounds).uniq.size - unless @@split_has_duplicates - assert_equal (@@split_ratio*size).round,size1, - "Dataset #{size} should be split into #{(@@split_ratio*size).round}/#{size-(@@split_ratio*size).round}"+ - " (exact: #{@@split_ratio*size}), instead: #{size1}/#{size2}" - end + assert_equal (@@split_ratio*size).round,size1, + "Dataset #{size} should be split into #{(@@split_ratio*size).round}/#{size-(@@split_ratio*size).round}"+ + " (exact: #{@@split_ratio*size}), instead: #{size1}/#{size2}" split[:data].compounds.each do |c| include1 = split[:split1].compounds.include?(c) include2 = split[:split2].compounds.include?(c) - unless @@split_has_duplicates - assert(((include1 and !include2) or (!include1 and include2))) - else - assert((include1 or include2)) - end + assert(((include1 and !include2) or (!include1 and include2))) end end def 
test_k_fold_stratified_split puts "test_k_fold_stratified_split" - data_combined = OpenTox::Dataset.merge(@@hamster,@@hamster_features,{},@@subjectid) - num_duplicates = 0 #hamster has no duplicates num_folds = 10 - avg_split_size = (data_combined.compounds.size+num_duplicates)/num_folds.to_f + avg_split_size = (@@hamster_features.compounds.size)/num_folds.to_f - @@resources += [ data_combined.uri ] - train, test = @@rutil.stratified_k_fold_split(data_combined,{},0,num_folds,@@subjectid,1) + train, test = @@rutil.stratified_k_fold_split(@@hamster_features,{},0,num_folds,@@subjectid,1) @@resources += (train + test).collect{ |r| r.uri } [train, test].each do |result| assert result.is_a?(Array) @@ -204,28 +190,20 @@ class RUtilTest < Test::Unit::TestCase sum_test+=test[i].compounds.size compounds_test += test[i].compounds - assert_equal (test[i].compounds.size+train[i].compounds.size),(data_combined.compounds.size+num_duplicates) + assert_equal (test[i].compounds.size+train[i].compounds.size),(@@hamster_features.compounds.size) compounds = (test[i].compounds + train[i].compounds) - if num_duplicates==0 - assert_equal compounds.sort,data_combined.compounds.sort - else - assert_equal compounds.uniq.sort,data_combined.compounds.sort - end - end - assert_equal sum_test,(data_combined.compounds.size+num_duplicates) - if num_duplicates==0 - assert_equal compounds_test.sort,data_combined.compounds.sort - else - assert_equal compounds_test.uniq.sort,data_combined.compounds.sort + assert_equal compounds.sort,@@hamster_features.compounds.sort end + assert_equal sum_test,(@@hamster_features.compounds.size) + assert_equal compounds_test.sort,@@hamster_features.compounds.sort end def test_feature_value_plot puts "feature_value_plot" split = stratified_split data = split[:data] - dataset1 = data.split( data.compounds[0..4], data.features.keys, {}, @@subjectid) - dataset2 = data.split( data.compounds[5..-1], data.features.keys, {}, @@subjectid) + dataset1 = data.split( (0..4).to_a, 
data.features.keys, {}, @@subjectid) + dataset2 = data.split( (5..(data.compounds.size-1)).to_a, data.features.keys, {}, @@subjectid) @@resources += [dataset1.uri, dataset2.uri] files = [] #plot diff --git a/validation.rb b/validation.rb index 159bf8e..6f750ed 100644 --- a/validation.rb +++ b/validation.rb @@ -38,10 +38,10 @@ class ValidationTest < Test::Unit::TestCase :info => "http://apps.ideaconsult.net:8080/ambit2/dataset/435293?page=0&pagesize=300" } @@files = { File.new("data/hamster_carcinogenicity.csv") => :crossvalidation, - #File.new("data/hamster_carcinogenicity.mini.csv") => :crossvalidation, #File.new("data/EPAFHM.csv") => :crossvalidation, File.new("data/EPAFHM.mini.csv") => :crossvalidation, File.new("data/hamster_carcinogenicity.csv") => :split_validation, + File.new("data/hamster_carcinogenicity.csv") => :bootstrap_validation, File.new("data/EPAFHM.csv") => :split_validation, #File.new("data/StJudes-HepG2-testset_Class.csv") => :crossvalidation } @@ -66,7 +66,7 @@ class ValidationTest < Test::Unit::TestCase def global_teardown puts "delete and logout" - if @@delete + if defined?(@@delete) and @@delete [:data, :train_data, :test_data].each do |d| @@data.each do |data| OpenTox::Dataset.find(data[d],@@subjectid).delete(@@subjectid) if data[d] and data[:delete] and OpenTox::Dataset.exist?(data[d], @@subjectid) @@ -88,6 +88,48 @@ class ValidationTest < Test::Unit::TestCase assert l.uri? 
end end + + def test_bootstrapping + + @@vs = [] unless defined?@@vs + @@data.each do |data| + if data[:type]==:bootstrap_validation + puts "bootstrapping "+data[:info].to_s + p = { + :dataset_uri => data[:data], + :algorithm_uri => File.join(CONFIG[:services]["opentox-algorithm"],"lazar"), + :algorithm_params => "feature_generation_uri="+File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"), + :prediction_feature => data[:feat], + :random_seed => 2} + t = OpenTox::SubTask.new(nil,0,1) + def t.progress(pct) + if !defined?@last_msg or @last_msg+10<Time.new + puts "waiting for boostrap validation: "+pct.to_s + @last_msg=Time.new + end + end + def t.waiting_for(task_uri); end + v = OpenTox::Validation.create_bootstrapping_validation(p, @@subjectid, t) + assert v.uri.uri? + if AA_SERVER + assert_rest_call_error OpenTox::NotAuthorizedError do + OpenTox::Validation.find(v.uri) + end + end + v = OpenTox::Validation.find(v.uri, @@subjectid) + assert_valid_date v + assert v.uri.uri? + assert_prob_correct(v) + model = v.metadata[OT.model] + assert model.uri? + v_list = OpenTox::Validation.list( {:model => model} ) + assert v_list.size==1 and v_list.include?(v.uri) + puts v.uri unless defined?(@@delete) and @@delete + @@vs << v + end + end + + end def test_training_test_split @@ -100,7 +142,7 @@ class ValidationTest < Test::Unit::TestCase :algorithm_uri => File.join(CONFIG[:services]["opentox-algorithm"],"lazar"), :algorithm_params => "feature_generation_uri="+File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"), :prediction_feature => data[:feat], - :split_ratio => 0.95, + :split_ratio => 0.7, :random_seed => 2} t = OpenTox::SubTask.new(nil,0,1) def t.progress(pct) @@ -121,11 +163,19 @@ class ValidationTest < Test::Unit::TestCase assert_valid_date v assert v.uri.uri? 
assert_prob_correct(v) + + train_compounds = OpenTox::Dataset.find(v.metadata[OT.trainingDataset]).compounds + test_compounds = OpenTox::Dataset.find(v.metadata[OT.testDataset]).compounds + orig_compounds = OpenTox::Dataset.find(data[:data]).compounds + assert_equal((orig_compounds.size*0.7).round,train_compounds.size) + assert_equal(orig_compounds.size,(train_compounds+test_compounds).size) + assert_equal(orig_compounds.uniq.size,(train_compounds+test_compounds).uniq.size) + model = v.metadata[OT.model] assert model.uri? v_list = OpenTox::Validation.list( {:model => model} ) assert v_list.size==1 and v_list.include?(v.uri) - puts v.uri unless @@delete + puts v.uri unless defined?(@@delete) and @@delete @@vs << v end end @@ -167,7 +217,7 @@ class ValidationTest < Test::Unit::TestCase assert model.uri? v_list = OpenTox::Validation.list( {:model => model} ) assert v_list.size==1 and v_list.include?(v.uri) - puts v.uri unless @@delete + puts v.uri unless defined?(@@delete) and @@delete @@vs << v end end @@ -204,7 +254,7 @@ class ValidationTest < Test::Unit::TestCase assert_equal report.uri,report2.uri report3_uri = v.find_or_create_report(@@subjectid) assert_equal report.uri,report3_uri - puts report2.uri unless @@delete + puts report2.uri unless defined?(@@delete) and @@delete @@reports << report2 end end @@ -283,7 +333,7 @@ class ValidationTest < Test::Unit::TestCase assert_equal report.errorType,OpenTox::NotAuthorizedError.to_s end end - puts cv.uri unless @@delete + puts cv.uri unless defined?(@@delete) and @@delete @@cvs << cv @@cv_datasets << data @@ -324,7 +374,7 @@ class ValidationTest < Test::Unit::TestCase assert_equal report.uri,report2.uri report3_uri = cv.find_or_create_report(@@subjectid) assert_equal report.uri,report3_uri - puts report2.uri unless @@delete + puts report2.uri unless defined?(@@delete) and @@delete @@reports << report2 end end @@ -362,14 +412,14 @@ class ValidationTest < Test::Unit::TestCase assert_equal report.uri,report2.uri report3 = 
OpenTox::AlgorithmComparisonReport.find_for_crossvalidation(@@cvs[j].uri,@@subjectid) assert_equal report.uri,report3.uri - puts report2.uri unless @@delete + puts report2.uri unless defined?(@@delete) and @@delete @@reports << report2 end end end end - if @@qmrf_test + if defined?(@@qmrf_test) and @@qmrf_test def test_qmrf_report #@@cv = OpenTox::Crossvalidation.find("http://local-ot/validation/crossvalidation/13", @@subjectid) @@ -404,7 +454,7 @@ class ValidationTest < Test::Unit::TestCase qmrf_uris = OpenTox::RestClientWrapper.get(File.join(CONFIG[:services]["opentox-validation"],"/reach_report/QMRF?model="+model_uri), {:subjectid => @@subjectid}).chomp.split("\n") assert qmrf_uris.size==1 and qmrf_uris[0]==qmrfReport.uri - puts qmrfReport.uri unless @@delete + puts qmrfReport.uri unless defined?(@@delete) and @@delete @@qmrfReports << qmrfReport end end |