summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2012-06-11 11:31:37 +0200
committer mguetlein <martin.guetlein@gmail.com> 2012-06-11 11:31:37 +0200
commit 826acfb96e310e0d431a56b7c0b3f5408ba2cc5f (patch)
tree 22adeb86da8eb90007e85c22bbeae3f66184f2d7
parent 11c793da54bc304cfd7f80fcf722fb9b488811e8 (diff)
make anti-strat independent of input order, minor fixes
-rw-r--r-- lib/dataset.rb 41
-rw-r--r-- lib/environment.rb 2
-rw-r--r-- lib/stratification.R 3
3 files changed, 44 insertions, 2 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index c916722..2c8c73a 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -412,12 +412,47 @@ module OpenTox
# @param [optional,Array] compounds2, if specified, only these compounds of dataset2 are used
# example: if you want no features from dataset2, pass an empty array as features2
def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil )
+# features_selected = (features1!=nil or features2!=nil)
features1 = dataset1.features.keys unless features1
features2 = dataset2.features.keys unless features2
+# compounds_selected = (compounds1!=nil or compounds2!=nil)
compounds1 = dataset1.compounds unless compounds1
compounds2 = dataset2.compounds unless compounds2
data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}")
+
+# if (compounds2.size*features2.size > compounds1.size*features1.size)
+# tmp_c = compounds1
+# tmp_f = features1
+# tmp_d = dataset1
+# compounds1 = compounds2
+# features1 = features2
+# dataset1 = dataset2
+# compounds2 = tmp_c
+# features2 = tmp_f
+# dataset2 = tmp_d
+# end
+# # merge hash
+# entries = dataset1.data_entries.merge(dataset2.data_entries)
+# # delete compounds
+# (entries.keys - [compounds1 + compounds2]).each{|c| entries.delete(c)} if compounds_selected
+# # delete features
+# if features_selected
+# feats = [features1 + features2]
+# entries.each do |c,f|
+# (f.keys - feats).each{|feat| f.delete(feat)}
+# end
+# end
+# # compounds that occur in both datasets have been overwritten by the merge (they are equal to the dataset2 values)
+# [compounds1 & compounds2].each do |c|
+# features1.each do |f|
+# dataset1.data_entries[c][f].each do |v|
+# entries[c][f] = v
+# end if dataset1.data_entries[c] and dataset1.data_entries[c][f]
+# end
+# end
+# data_combined.data_entries = entries
+
[[dataset1, features1, compounds1], [dataset2, features2, compounds2]].each do |dataset,features,compounds|
compounds.each{|c| data_combined.add_compound(c)}
features.each do |f|
@@ -431,6 +466,7 @@ module OpenTox
end
end
end
+
metadata = {} unless metadata
metadata[OT.hasSource] = "Merge from #{dataset1.uri} and #{dataset2.uri}" unless metadata[OT.hasSource]
data_combined.add_metadata(metadata)
@@ -438,6 +474,11 @@ module OpenTox
data_combined
end
+# def data_entries=(entries)
+# @data_entries = entries
+# end
+
+
# Save dataset at the dataset service
# - creates a new dataset if uri is not set
# - overwrites dataset if uri exists
diff --git a/lib/environment.rb b/lib/environment.rb
index c1b8312..7033ad3 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -11,7 +11,7 @@ TMP_DIR = File.join(basedir, "tmp")
LOG_DIR = File.join(basedir, "log")
if File.exist?(config_file)
- CONFIG = YAML.load_file(config_file)
+ CONFIG = YAML.load_file(config_file) unless defined?(CONFIG)
raise "could not load config, config file: "+config_file.to_s unless CONFIG
else
FileUtils.mkdir_p TMP_DIR
diff --git a/lib/stratification.R b/lib/stratification.R
index c15dee6..fb934a4 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -143,7 +143,7 @@ anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
}
split <- array(1:nrow(data))
count = 0
- for(j in 1:nrow(data))
+ for(j in sample(array(nrow(data))))
{
if (count<num && cl[j]==idx)
{
@@ -300,6 +300,7 @@ plot_split <- function( data, split, names=NULL, ... )
#data<-rbind(data,c)
#data=iris
#split = stratified_k_fold_split(data, num_folds=3)
+#split = anti_stratified_split(data, ratio=0.75)
#split = stratified_split(data, ratio=0.75)
#print(split)
#print(sum(split))