summary refs log tree commit diff
path: root/cv/data/utils
diff options
context:
space:
mode:
Diffstat (limited to 'cv/data/utils')
-rw-r--r-- cv/data/utils/count_features.rb 48
-rw-r--r-- cv/data/utils/create_selected_feature_ds.rb 7
2 files changed, 52 insertions, 3 deletions
diff --git a/cv/data/utils/count_features.rb b/cv/data/utils/count_features.rb
new file mode 100644
index 0000000..1e272ac
--- /dev/null
+++ b/cv/data/utils/count_features.rb
@@ -0,0 +1,48 @@
+require 'rubygems'
+require 'opentox-ruby'
+require 'yaml'
+
+@subjectid = nil
+
+
+
+# Prints feature counts for the dataset at ds_uri and appends the DC.title of
+# every feature typed as "MissingFeature" to @missing_features.
+#
+# ds_uri - dataset URI handed to OpenTox::Dataset.find (with @subjectid).
+def count_features(ds_uri)
+  puts ds_uri
+  dataset = OpenTox::Dataset.find(ds_uri, @subjectid)
+  features = dataset.features.keys
+  puts "# all features: #{features.size}"
+
+  # any? lists a feature once even if several of its type strings match, so
+  # the counts below are not inflated; Array() tolerates a missing type entry.
+  delete_features = features.select { |fn|
+    Array(dataset.features[fn][RDF.type]).any? { |typestr| typestr.include? "MissingFeature" }
+  }
+  delete_features.each { |fn| @missing_features << dataset.features[fn][DC.title] }
+
+  puts "# Missingfeatures: #{delete_features.size}"
+  features = features - delete_features
+  puts "# numeric features: #{features.size}"
+  puts "-----"
+end
+
+
+# Report feature counts per dataset entry, then all missing feature titles.
+@missing_features = []
+ds = YAML::load_file("../datasets.yaml")
+ds.keys.each { |dataset|
+  puts "----------", dataset
+  ds[dataset].keys.each { |pc|
+    next if ["dataset", "test", "training"].include?(pc) # keys not counted
+    puts pc
+    count_features(ds[dataset][pc])
+  }
+  puts "----------"
+  puts
+}
+puts
+puts "Missing features over all datasets:"
+puts @missing_features.uniq.to_yaml # uniq! would return nil without duplicates
diff --git a/cv/data/utils/create_selected_feature_ds.rb b/cv/data/utils/create_selected_feature_ds.rb
index 0e5f063..c30a23e 100644
--- a/cv/data/utils/create_selected_feature_ds.rb
+++ b/cv/data/utils/create_selected_feature_ds.rb
@@ -35,10 +35,11 @@ ds = YAML::load_file("../datasets.yaml")
ds.keys.each { |dataset|
puts "----------------- next dataset -----------------"
ds[dataset].keys.each { |pc|
- puts pc unless pc == "dataset"
- [false, true].each { |del_missing|
+ puts pc unless (pc == "dataset") || (pc == "test") || (pc == "training")
+ #[false, true].each { |del_missing|
+ [false].each { |del_missing| #false is default
begin
- create_f_ds(ds[dataset]["dataset"], ds[dataset][pc], del_missing) unless pc == "dataset"
+ create_f_ds(ds[dataset]["dataset"], ds[dataset][pc], del_missing) unless (pc == "dataset") || (pc == "test") || (pc == "training")
rescue
end
}