diff options
Diffstat (limited to 'cv/data/utils')
-rw-r--r-- | cv/data/utils/count_features.rb | 48 | ||||
-rw-r--r-- | cv/data/utils/create_selected_feature_ds.rb | 7 |
2 files changed, 52 insertions, 3 deletions
diff --git a/cv/data/utils/count_features.rb b/cv/data/utils/count_features.rb
new file mode 100644
index 0000000..1e272ac
--- /dev/null
+++ b/cv/data/utils/count_features.rb
@@ -0,0 +1,48 @@
+require 'rubygems'
+require 'opentox-ruby'
+require 'yaml'
+
+@subjectid = nil
+
+
+
+def count_features(ds_uri)
+  puts ds_uri
+
+  dataset = OpenTox::Dataset.find(ds_uri, @subjectid)
+
+  features = dataset.features.keys
+  puts "# all features: #{features.size}"
+
+  delete_features = []
+  features.each{ |fn|
+    dataset.features[fn][RDF.type].each { |typestr|
+      if typestr.include? "MissingFeature"
+        delete_features << fn
+        @missing_features << dataset.features[fn][DC.title]
+      end
+    }
+  }
+  puts "# Missingfeatures: #{delete_features.size}"
+  features = features - delete_features
+  puts "# numeric features: #{features.size}"
+  puts "-----"
+end
+
+
+@missing_features = []
+
+ds = YAML::load_file("../datasets.yaml")
+ds.keys.each { |dataset|
+  puts "----------"
+  puts dataset
+  ds[dataset].keys.each { |pc|
+    puts pc unless (pc == "dataset") || (pc == "test") || (pc == "training")
+    count_features(ds[dataset][pc]) unless (pc == "dataset") || (pc == "test") || (pc == "training")
+  }
+  puts "----------"
+  puts
+}
+puts
+puts "Missing features over all datasets:"
+puts @missing_features.uniq!.to_yaml
diff --git a/cv/data/utils/create_selected_feature_ds.rb b/cv/data/utils/create_selected_feature_ds.rb
index 0e5f063..c30a23e 100644
--- a/cv/data/utils/create_selected_feature_ds.rb
+++ b/cv/data/utils/create_selected_feature_ds.rb
@@ -35,10 +35,11 @@ ds = YAML::load_file("../datasets.yaml")
 ds.keys.each { |dataset|
   puts "----------------- next dataset -----------------"
   ds[dataset].keys.each { |pc|
-    puts pc unless pc == "dataset"
-    [false, true].each { |del_missing|
+    puts pc unless (pc == "dataset") || (pc == "test") || (pc == "training")
+    #[false, true].each { |del_missing|
+    [false].each { |del_missing| #false is default
       begin
-        create_f_ds(ds[dataset]["dataset"], ds[dataset][pc], del_missing) unless pc == "dataset"
+        create_f_ds(ds[dataset]["dataset"], ds[dataset][pc], del_missing) unless (pc == "dataset") || (pc == "test") || (pc == "training")
       rescue
       end
     }