summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormguetlein <martin.guetlein@gmail.com>2012-06-09 10:34:38 +0200
committermguetlein <martin.guetlein@gmail.com>2012-06-09 10:34:38 +0200
commit11c793da54bc304cfd7f80fcf722fb9b488811e8 (patch)
tree4df0ed6e3fbf87b4de20d7cda156e49b952c9f1b
parent3b2f2033aa1d0936009bf13bc32ef6938834efb6 (diff)
new branch val_exp
-rw-r--r--.gitignore2
-rw-r--r--lib/feature.rb2
-rw-r--r--lib/parser.rb1
-rw-r--r--lib/r-util.rb70
-rw-r--r--lib/serializer.rb6
-rw-r--r--lib/stratification.R49
-rw-r--r--lib/task.rb4
7 files changed, 117 insertions, 17 deletions
diff --git a/.gitignore b/.gitignore
index 75924e1..615100b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pkg
doc
mysql-bak.rb
*~
+.project
+.buildpath
diff --git a/lib/feature.rb b/lib/feature.rb
index 55ac678..3957e0c 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -18,7 +18,7 @@ module OpenTox
feature.subjectid = subjectid
feature
end
-
+
# provides feature type, possible types are "regression" or "classification"
# @return [String] feature type, unknown if OT.isA property is unknown/ not set
def feature_type
diff --git a/lib/parser.rb b/lib/parser.rb
index 07b44db..2193cf4 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -386,6 +386,7 @@ module OpenTox
if (drop_missing && drop)
@format_errors << "Row #{i} not added"
end
+ puts i
}
warnings
@dataset
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 0d4e82c..463a145 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -63,7 +63,8 @@ module OpenTox
def paired_ttest(array1, array2, significance_level=0.95)
@r.assign "v1",array1
@r.assign "v2",array2
- @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+ paired = array1.size==array2.size ? "T" : "F"
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=#{paired})"
t = @r.pull "ttest$statistic"
p = @r.pull "ttest$p.value"
if (1-significance_level > p)
@@ -73,15 +74,61 @@ module OpenTox
end
end
+ def pvalue(array1, array2)
+ @r.assign "v1",array1
+ @r.assign "v2",array2
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2))"
+ @r.pull "ttest$p.value"
+ end
+
+
+ def ttest(array1, value2, significance_level=0.95)
+ @r.assign "v1",array1
+ @r.eval "ttest = t.test(as.numeric(v1),conf.level=#{significance_level})"
+ min = @r.pull "ttest$conf.int[1]"
+ max = @r.pull "ttest$conf.int[2]"
+ if value2 <= min
+ 1
+ elsif value2 >= max
+ -1
+ else
+ 0
+ end
+ end
+
+
# example:
# files = ["/tmp/box.svg","/tmp/box.png"]
# data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
# boxplot(files, data, "comparison1" )
#
- def boxplot(files, data, title="")
- LOGGER.debug("r-util> create boxplot")
+ def boxplot(files, data, title="", hline=nil)
+ LOGGER.debug("r-util> create boxplot "+data.inspect)
+ raise "no hashes, to keep order" if data.is_a?(Hash)
+ max = -1
+ min = 100000
+ data.size.times do |i|
+ values = data[i][1]
+ max = [max,values.size].max
+ min = [min,values.size].min
+ data[i] = [data[i][0]+"(#{values.size})",data[i][1]]
+ end
+ if min != max
+ times = max/min.to_f
+ raise "box-plot values do not have equal size #{min} <-> #{max}" if times.floor != times.ceil
+ data.size.times do |i|
+ m = data[i][0]
+ values = data[i][1]
+ data[i] = [ m, values*times.to_i ] if values.size<max
+ end
+ min = 100000
+ data.each do |m,values|
+ max = [max,values.size].max
+ min = [min,values.size].min
+ end
+ end
assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
- plot_to_files(files) do |file|
+ plot_to_files(files, hline) do |file|
@r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
end
end
@@ -179,8 +226,8 @@ module OpenTox
# stratified splits a dataset into two dataset according to the feature values
# all features are taken into account unless <split_features> is given
# returns two datases
- def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
- stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
+ def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
+ stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification )
end
# stratified splits a dataset into k datasets according the feature values
@@ -191,7 +238,7 @@ module OpenTox
end
private
- def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
+ def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
raise "internal error" if num_folds!=nil and pct!=nil
k_fold_split = num_folds!=nil
if k_fold_split
@@ -227,9 +274,11 @@ module OpenTox
end
return train, test
else
- puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
- @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+ anti = anti_stratification ? "anti_" : ""
+ puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+ @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
split = @r.pull 'split'
+ puts "XXXXXXXXXXXX "+split.class.to_s
metadata[DC.title] = "Training dataset split of "+dataset.uri
train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
metadata[DC.title] = "Test dataset split of "+dataset.uri
@@ -393,7 +442,7 @@ module OpenTox
begin File.delete(tmp); rescue; end
end
- def plot_to_files(files)
+ def plot_to_files(files,hline=nil)
files.each do |file|
if file=~/(?i)\.svg/
@r.eval("svg('#{file}',10,8)")
@@ -403,6 +452,7 @@ module OpenTox
raise "invalid format: "+file.to_s
end
yield file
+ @r.eval("abline(h=#{hline}, col = \"gray60\")") unless hline==nil
LOGGER.debug "r-util> plotted to #{file}"
@r.eval("dev.off()")
end
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 4c26329..101bb81 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -463,7 +463,7 @@ module OpenTox
features = dataset.features.keys
# prepare for subgraphs
- have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+ have_substructures = features.collect{ |id| dataset.features[id][RDF.type] and dataset.features[id][RDF.type].include?(OT.Substructure) }.compact.uniq
if have_substructures.size == 1 && have_substructures[0]
features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
end
@@ -475,7 +475,7 @@ module OpenTox
if typestr.include? "MissingFeature"
delete_features << id
end
- }
+ } if dataset.features[id][RDF.type]
}
features = features - delete_features
@@ -502,7 +502,7 @@ module OpenTox
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
cmpd = Compound.new(compound)
- inchi = URI.encode_www_form_component(cmpd.to_inchi)
+ inchi = cmpd.to_smiles() #URI.encode_www_form_component(cmpd.to_inchi)
# allocate container
row_container = Array.new(compound_sizes[compound])
diff --git a/lib/stratification.R b/lib/stratification.R
index 3f8698c..c15dee6 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -115,6 +115,52 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
stop("unknown method")
}
+anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
+{
+ if (ratio > 0.5)
+ {
+ ratio = 1-ratio
+ swap = TRUE
+ }
+ else
+ swap = FALSE
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("anti-split using #features: ",ncol(data.processed)))
+ num_c = floor(1/ratio)
+ cl = cluster(data.processed, num_c, num_c)
+ #print(cl)
+ idx = -1
+ min = 1000000
+ num = round_it(nrow(data)*ratio)
+ for(j in 1:max(cl))
+ {
+ cl_size = length(subset(cl, cl==j))
+ if (cl_size<min && cl_size>=num)
+ {
+ idx = j
+ min = cl_size
+ }
+ }
+ split <- array(1:nrow(data))
+ count = 0
+ for(j in 1:nrow(data))
+ {
+ if (count<num && cl[j]==idx)
+ {
+ split[j] = 1
+ count=count+1
+ }
+ else
+ split[j] = 0
+
+ }
+ if (swap)
+ for(j in 1:nrow(data))
+ split[j] = 1-split[j]
+ #print(split)
+ as.vector(split)
+}
+
stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
{
print(paste(num_folds,"-fold-split, data-size",nrow(data)))
@@ -254,7 +300,8 @@ plot_split <- function( data, split, names=NULL, ... )
#data<-rbind(data,c)
#data=iris
#split = stratified_k_fold_split(data, num_folds=3)
-#split = stratified_split(data, ratio=0.33, method="cluster")
+#split = stratified_split(data, ratio=0.75)
+#print(split)
#print(sum(split))
#plot_split(plot_pre_process(data),split,c("training","test"))
diff --git a/lib/task.rb b/lib/task.rb
index 102f4dc..d8f0ba6 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -324,8 +324,8 @@ module OpenTox
def initialize(task, min, max)
raise "not a task or subtask" if task!=nil and !(task.is_a?(Task) or task.is_a?(SubTask))
- raise "invalid max ("+max.to_s+"), min ("+min.to_s+") params" unless
- min.is_a?(Numeric) and max.is_a?(Numeric) and min >= 0 and max <= 100 and max > min
+ raise "subtask init: invalid max (#{max}, #{max.class}), min (#{min}, #{min.class}) params" unless
+ min.is_a?(Numeric) and max.is_a?(Numeric) and min >= 0 and max <= 100.0 and max > min
@task = task
@min = min
@max = max