From 11c793da54bc304cfd7f80fcf722fb9b488811e8 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Sat, 9 Jun 2012 10:34:38 +0200 Subject: new branch val_exp --- .gitignore | 2 ++ lib/feature.rb | 2 +- lib/parser.rb | 1 + lib/r-util.rb | 70 ++++++++++++++++++++++++++++++++++++++++++++-------- lib/serializer.rb | 6 ++--- lib/stratification.R | 49 +++++++++++++++++++++++++++++++++++- lib/task.rb | 4 +-- 7 files changed, 117 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 75924e1..615100b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ pkg doc mysql-bak.rb *~ +.project +.buildpath diff --git a/lib/feature.rb b/lib/feature.rb index 55ac678..3957e0c 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -18,7 +18,7 @@ module OpenTox feature.subjectid = subjectid feature end - + # provides feature type, possible types are "regression" or "classification" # @return [String] feature type, unknown if OT.isA property is unknown/ not set def feature_type diff --git a/lib/parser.rb b/lib/parser.rb index 07b44db..2193cf4 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -386,6 +386,7 @@ module OpenTox if (drop_missing && drop) @format_errors << "Row #{i} not added" end + puts i } warnings @dataset diff --git a/lib/r-util.rb b/lib/r-util.rb index 0d4e82c..463a145 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -63,7 +63,8 @@ module OpenTox def paired_ttest(array1, array2, significance_level=0.95) @r.assign "v1",array1 @r.assign "v2",array2 - @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)" + paired = array1.size==array2.size ? "T" : "F" + @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=#{paired})" t = @r.pull "ttest$statistic" p = @r.pull "ttest$p.value" if (1-significance_level > p) @@ -73,15 +74,61 @@ module OpenTox end end + def pvalue(array1, array2) + @r.assign "v1",array1 + @r.assign "v2",array2 + @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2))" + @r.pull "ttest$p.value" + end + + + def ttest(array1, value2, significance_level=0.95) + @r.assign "v1",array1 + @r.eval "ttest = t.test(as.numeric(v1),conf.level=#{significance_level})" + min = @r.pull "ttest$conf.int[1]" + max = @r.pull "ttest$conf.int[2]" + if value2 <= min + 1 + elsif value2 >= max + -1 + else + 0 + end + end + + # example: # files = ["/tmp/box.svg","/tmp/box.png"] # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ] # boxplot(files, data, "comparison1" ) # - def boxplot(files, data, title="") - LOGGER.debug("r-util> create boxplot") + def boxplot(files, data, title="", hline=nil) + LOGGER.debug("r-util> create boxplot "+data.inspect) + raise "no hashes, to keep order" if data.is_a?(Hash) + max = -1 + min = 100000 + data.size.times do |i| + values = data[i][1] + max = [max,values.size].max + min = [min,values.size].min + data[i] = [data[i][0]+"(#{values.size})",data[i][1]] + end + if min != max + times = max/min.to_f + raise "box-plot values do not have equal size #{min} <-> #{max}" if times.floor != times.ceil + data.size.times do |i| + m = data[i][0] + values = data[i][1] + data[i] = [ m, values*times.to_i ] if values.size is given # returns two datases - def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) - stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) + def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false ) + stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification ) end # stratified splits a dataset into k datasets according the feature values @@ -191,7 +238,7 @@ module OpenTox end private - def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) + def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, anti_stratification=false ) raise "internal error" if num_folds!=nil and pct!=nil k_fold_split = num_folds!=nil if k_fold_split @@ -227,9 +274,11 @@ module OpenTox end return train, test else - puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" + anti = anti_stratification ? "anti_" : "" + puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" + @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" split = @r.pull 'split' + puts "XXXXXXXXXXXX "+split.class.to_s metadata[DC.title] = "Training dataset split of "+dataset.uri train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 } metadata[DC.title] = "Test dataset split of "+dataset.uri @@ -393,7 +442,7 @@ module OpenTox begin File.delete(tmp); rescue; end end - def plot_to_files(files) + def plot_to_files(files,hline=nil) files.each do |file| if file=~/(?i)\.svg/ @r.eval("svg('#{file}',10,8)") @@ -403,6 +452,7 @@ module OpenTox raise "invalid format: "+file.to_s end yield file + @r.eval("abline(h=#{hline}, col = \"gray60\")") unless hline==nil LOGGER.debug "r-util> plotted to #{file}" @r.eval("dev.off()") end diff --git a/lib/serializer.rb b/lib/serializer.rb index 4c26329..101bb81 100644 --- a/lib/serializer.rb +++ b/lib/serializer.rb @@ -463,7 +463,7 @@ module OpenTox features = dataset.features.keys # prepare for subgraphs - have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq + have_substructures = features.collect{ |id| dataset.features[id][RDF.type] and dataset.features[id][RDF.type].include?(OT.Substructure) }.compact.uniq if have_substructures.size == 1 && have_substructures[0] features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" } end @@ -475,7 +475,7 @@ module OpenTox if typestr.include? "MissingFeature" delete_features << id end - } + } if dataset.features[id][RDF.type] } features = features - delete_features @@ -502,7 +502,7 @@ module OpenTox dataset.compounds.each do |compound| entries=dataset.data_entries[compound] cmpd = Compound.new(compound) - inchi = URI.encode_www_form_component(cmpd.to_inchi) + inchi = cmpd.to_smiles() #URI.encode_www_form_component(cmpd.to_inchi) # allocate container row_container = Array.new(compound_sizes[compound]) diff --git a/lib/stratification.R b/lib/stratification.R index 3f8698c..c15dee6 100644 --- a/lib/stratification.R +++ b/lib/stratification.R @@ -115,6 +115,52 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL ) stop("unknown method") } +anti_stratified_split <- function( data, ratio=0.3, colnames=NULL) +{ + if (ratio > 0.5) + { + ratio = 1-ratio + swap = TRUE + } + else + swap = FALSE + data.processed = as.matrix(process_data( data, colnames )) + print(paste("anti-split using #features: ",ncol(data.processed))) + num_c = floor(1/ratio) + cl = cluster(data.processed, num_c, num_c) + #print(cl) + idx = -1 + min = 1000000 + num = round_it(nrow(data)*ratio) + for(j in 1:max(cl)) + { + cl_size = length(subset(cl, cl==j)) + if (cl_size=num) + { + idx = j + min = cl_size + } + } + split <- array(1:nrow(data)) + count = 0 + for(j in 1:nrow(data)) + { + if (count= 0 and max <= 100 and max > min + raise "subtask init: invalid max (#{max}, #{max.class}), min (#{min}, #{min.class}) params" unless + min.is_a?(Numeric) and max.is_a?(Numeric) and min >= 0 and max <= 100.0 and max > min @task = task @min = min @max = max -- cgit v1.2.3