diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-06-11 11:31:37 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-06-11 11:31:37 +0200 |
commit | 826acfb96e310e0d431a56b7c0b3f5408ba2cc5f (patch) | |
tree | 22adeb86da8eb90007e85c22bbeae3f66184f2d7 | |
parent | 11c793da54bc304cfd7f80fcf722fb9b488811e8 (diff) |
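Make anti-stratified split independent of input order (rows are now visited in random order); minor fixes (default nil metadata to an empty hash in Dataset.merge, do not reload CONFIG if it is already defined)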
-rw-r--r-- | lib/dataset.rb | 41 | ||||
-rw-r--r-- | lib/environment.rb | 2 | ||||
-rw-r--r-- | lib/stratification.R | 3 |
3 files changed, 44 insertions, 2 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb index c916722..2c8c73a 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -412,12 +412,47 @@ module OpenTox # @param [optional,Array] compounds2, if specified only this compounds of dataset2 are used # example: if you want no features from dataset2, give empty array as features2 def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil ) +# features_selected = (features1!=nil or features2!=nil) features1 = dataset1.features.keys unless features1 features2 = dataset2.features.keys unless features2 +# compounds_selected = (compounds1!=nil or compounds2!=nil) compounds1 = dataset1.compounds unless compounds1 compounds2 = dataset2.compounds unless compounds2 data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid) LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}") + +# if (compounds2.size*features2.size > compounds1.size*features1.size) +# tmp_c = compounds1 +# tmp_f = features1 +# tmp_d = dataset1 +# compounds1 = compounds2 +# features1 = features2 +# dataset1 = dataset2 +# compounds2 = tmp_c +# features2 = tmp_f +# dataset2 = tmp_d +# end +# # merge hash +# entries = dataset1.data_entries.merge(dataset2.data_entries) +# # delete compounds +# (entries.keys - [compounds1 + compounds2]).each{|c| entries.delete(c)} if compounds_selected +# # delete features +# if features_selected +# feats = [features1 + features2] +# entries.each do |c,f| +# (f.keys - feats).each{|feat| f.delete(feat)} +# end +# end +# # compounds that occur in both datasets, have been overwritten by merge (are equal to dataset2 values) +# [compounds1 & compounds2].each do |c| +# features1.each do |f| +# dataset1.data_entries[c][f].each do |v| +# entries[c][f] = v +# end if dataset1.data_entries[c] and dataset1.data_entries[c][f] +# end +# end +# data_combined.data_entries = entries + [[dataset1, features1, compounds1], 
[dataset2, features2, compounds2]].each do |dataset,features,compounds| compounds.each{|c| data_combined.add_compound(c)} features.each do |f| @@ -431,6 +466,7 @@ module OpenTox end end end + metadata = {} unless metadata metadata[OT.hasSource] = "Merge from #{dataset1.uri} and #{dataset2.uri}" unless metadata[OT.hasSource] data_combined.add_metadata(metadata) @@ -438,6 +474,11 @@ module OpenTox data_combined end +# def data_entries=(entries) +# @data_entries = entries +# end + + # Save dataset at the dataset service # - creates a new dataset if uri is not set # - overwrites dataset if uri exists diff --git a/lib/environment.rb b/lib/environment.rb index c1b8312..7033ad3 100644 --- a/lib/environment.rb +++ b/lib/environment.rb @@ -11,7 +11,7 @@ TMP_DIR = File.join(basedir, "tmp") LOG_DIR = File.join(basedir, "log") if File.exist?(config_file) - CONFIG = YAML.load_file(config_file) + CONFIG = YAML.load_file(config_file) unless defined?(CONFIG) raise "could not load config, config file: "+config_file.to_s unless CONFIG else FileUtils.mkdir_p TMP_DIR diff --git a/lib/stratification.R b/lib/stratification.R index c15dee6..fb934a4 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -143,7 +143,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) } split <- array(1:nrow(data)) count = 0 - for(j in 1:nrow(data)) + for(j in sample(array(nrow(data)))) { if (count<num && cl[j]==idx) { @@ -300,6 +300,7 @@ plot_split <- function( data, split, names=NULL, ... ) #data<-rbind(data,c) #data=iris #split = stratified_k_fold_split(data, num_folds=3) +#split = anti_stratified_split(data, ratio=0.75) #split = stratified_split(data, ratio=0.75) #print(split) #print(sum(split)) |