diff options
Diffstat (limited to '5x_cv/check_datasets_inchi_equality.rb')
-rw-r--r-- | 5x_cv/check_datasets_inchi_equality.rb | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/5x_cv/check_datasets_inchi_equality.rb b/5x_cv/check_datasets_inchi_equality.rb new file mode 100644 index 0000000..7d38e96 --- /dev/null +++ b/5x_cv/check_datasets_inchi_equality.rb @@ -0,0 +1,98 @@ +require 'rubygems' +require 'opentox-ruby' +require 'yaml' + +@subjectid = nil + + + +def check_ds(t_ds_uri, f_ds_uri) + puts t_ds_uri + puts f_ds_uri + + regression_training_dataset = OpenTox::Dataset.find(t_ds_uri, @subjectid)#3963;1572; + regression_feature_dataset = OpenTox::Dataset.find(f_ds_uri, @subjectid)#3971;3946; + + train_ds = regression_training_dataset.data_entries.keys + train_cmds = regression_training_dataset.compounds + feature_ds = regression_feature_dataset.data_entries.keys + feature_cmds = regression_feature_dataset.compounds + + puts "----- Check activity inchi -----" + match=0 + mismatch=0 + train_ds.each { |i| + if feature_ds.include?(i) + match = match + 1 + else + mismatch = mismatch + 1 + end + } + if mismatch > 0 + puts "NOT all training compounds represented in feature dataset!!!" unless mismatch > 0 + puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!" + else + puts "All training compounds represented in feature dataset." unless mismatch > 0 + puts "match: #{match}; mismatch: #{mismatch}" + puts "OK!!!" + end + + train_ds.sort! + feature_ds.sort! + + if train_ds == feature_ds + puts "train_ds == feature_ds" + else + a = train_ds - feature_ds + #puts "d: '#{a}'" + puts "train_ds: " + train_ds.size.to_s + "; feature_ds: "+ feature_ds.size.to_s + puts "train_ds =NOT feature_ds" + end + + + + puts "----- Check compound inchi -----" + match=0 + mismatch=0 + train_cmds.each { |i| + if feature_cmds.include?(i) + match = match + 1 + else + mismatch = mismatch + 1 + end + } + if mismatch > 0 + puts "NOT all training compounds represented in feature dataset!!!" unless mismatch > 0 + puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!" + else + puts "All training compounds represented in feature dataset." unless mismatch > 0 + puts "match: #{match}; mismatch: #{mismatch}" + puts "OK!!!" + end + + + + feature_cmds.sort! + train_cmds.sort! + + if train_cmds == feature_cmds + puts "train_cmds == feature_cmds" + else + b = train_cmds - feature_cmds + #puts "d: '#{b}'" + puts "train_cmds: " + train_cmds.size.to_s + "; feature_cmds: " + feature_cmds.size.to_s + puts "train_cmds =NOT feature_cmds" + end + puts +end + + + + +ds = YAML::load_file("datasets_nestle.yaml") +ds.keys.each { |dataset| + ds[dataset].keys.each { |pc| + puts pc + check_ds(ds[dataset]["dataset"], ds[dataset][pc]) + } +} |