new branch val_exp

author: mguetlein <martin.guetlein@gmail.com> 2012-06-09 10:34:38 +0200
committer: mguetlein <martin.guetlein@gmail.com> 2012-06-09 10:34:38 +0200
commit: 11c793da54bc304cfd7f80fcf722fb9b488811e8 (patch)
tree: 4df0ed6e3fbf87b4de20d7cda156e49b952c9f1b
parent: 3b2f2033aa1d0936009bf13bc32ef6938834efb6 (diff)
7 files changed, 117 insertions, 17 deletions
diff --git a/.gitignore b/.gitignore
index 75924e1..615100b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pkg
 doc
 mysql-bak.rb
 *~
+.project
+.buildpath
diff --git a/lib/feature.rb b/lib/feature.rb
index 55ac678..3957e0c 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -18,7 +18,7 @@ module OpenTox
       feature.subjectid = subjectid
       feature
     end
-
+    
     # provides feature type, possible types are "regression" or "classification"
     # @return [String] feature type, unknown if OT.isA property is unknown/ not set
     def feature_type
diff --git a/lib/parser.rb b/lib/parser.rb
index 07b44db..2193cf4 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -386,6 +386,7 @@ module OpenTox
           if (drop_missing && drop) 
             @format_errors << "Row #{i} not added" 
           end
+          puts i
         }
         warnings
         @dataset
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 0d4e82c..463a145 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -63,7 +63,8 @@ module OpenTox
     def paired_ttest(array1, array2, significance_level=0.95)
       @r.assign "v1",array1
       @r.assign "v2",array2
-      @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+      paired = array1.size==array2.size ? "T" : "F"
+      @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=#{paired})"
       t = @r.pull "ttest$statistic"
       p = @r.pull "ttest$p.value"
       if (1-significance_level > p)
@@ -73,15 +74,61 @@ module OpenTox
       end
     end
     
+    def pvalue(array1, array2)
+      @r.assign "v1",array1
+      @r.assign "v2",array2
+      @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2))"
+      @r.pull "ttest$p.value"
+    end
+        
+    
+    def ttest(array1, value2, significance_level=0.95)
+      @r.assign "v1",array1
+      @r.eval "ttest = t.test(as.numeric(v1),conf.level=#{significance_level})"
+      min = @r.pull "ttest$conf.int[1]"
+      max = @r.pull "ttest$conf.int[2]"
+      if value2 <= min
+        1
+      elsif value2 >= max
+        -1
+      else
+        0
+      end
+    end
+    
+    
     # example: 
     # files = ["/tmp/box.svg","/tmp/box.png"]
     # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
     # boxplot(files, data, "comparison1" )
     #
-    def boxplot(files, data, title="")
-      LOGGER.debug("r-util> create boxplot")
+    def boxplot(files, data, title="", hline=nil)
+      LOGGER.debug("r-util> create boxplot "+data.inspect)
+      raise "no hashes, to keep order" if data.is_a?(Hash)
+      max = -1
+      min = 100000
+      data.size.times do |i|
+        values = data[i][1]
+        max = [max,values.size].max
+        min = [min,values.size].min
+        data[i] = [data[i][0]+"(#{values.size})",data[i][1]]
+      end
+      if min != max
+        times = max/min.to_f
+        raise "box-plot values do not have equal size #{min} <-> #{max}" if times.floor != times.ceil
+        data.size.times do |i|
+          m = data[i][0]
+          values = data[i][1]
+          data[i] = [ m, values*times.to_i ] if values.size<max
+        end
+        min = 100000
+        data.each do |m,values|
+          max = [max,values.size].max
+          min = [min,values.size].min
+        end
+      end
       assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
-      plot_to_files(files) do |file|
+      plot_to_files(files, hline) do |file|
         @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
       end
     end
@@ -179,8 +226,8 @@ module OpenTox
     # stratified splits a dataset into two dataset according to the feature values
     # all features are taken into account unless <split_features> is given
     # returns two datases
-    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
-      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
+    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
+      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features, anti_stratification ) # nil = num_folds (single split, no k-fold); anti_stratification=true makes the R side call anti_stratified_split
     end
     
     # stratified splits a dataset into k datasets according the feature values
@@ -191,7 +238,7 @@ module OpenTox
     end    
     
     private
-    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
+    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil, anti_stratification=false )
       raise "internal error" if num_folds!=nil and pct!=nil
       k_fold_split = num_folds!=nil
       if k_fold_split
@@ -227,9 +274,11 @@ module OpenTox
         end
         return train, test
       else
-        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
-        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        anti = anti_stratification ? "anti_" : ""
+        puts "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        @r.eval "split <- #{anti}stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
         split = @r.pull 'split'
+        puts "XXXXXXXXXXXX "+split.class.to_s
         metadata[DC.title] = "Training dataset split of "+dataset.uri
         train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
         metadata[DC.title] = "Test dataset split of "+dataset.uri
@@ -393,7 +442,7 @@ module OpenTox
       begin File.delete(tmp); rescue; end
     end
     
-    def plot_to_files(files)
+    def plot_to_files(files,hline=nil)
       files.each do |file|
         if file=~/(?i)\.svg/
           @r.eval("svg('#{file}',10,8)")
@@ -403,6 +452,7 @@ module OpenTox
           raise "invalid format: "+file.to_s
         end
         yield file
+        @r.eval("abline(h=#{hline}, col = \"gray60\")") unless hline==nil
         LOGGER.debug "r-util> plotted to #{file}"
         @r.eval("dev.off()")
       end
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 4c26329..101bb81 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -463,7 +463,7 @@ module OpenTox
         features = dataset.features.keys
 
         # prepare for subgraphs
-        have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+        have_substructures = features.collect{ |id| dataset.features[id][RDF.type] and dataset.features[id][RDF.type].include?(OT.Substructure) }.compact.uniq
         if have_substructures.size == 1 && have_substructures[0] 
           features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
         end
@@ -475,7 +475,7 @@ module OpenTox
             if typestr.include? "MissingFeature"
               delete_features << id 
             end
-          }
+          } if dataset.features[id][RDF.type]
         }
         features = features - delete_features
 
@@ -502,7 +502,7 @@ module OpenTox
         dataset.compounds.each do |compound|
           entries=dataset.data_entries[compound]
           cmpd = Compound.new(compound)
-          inchi = URI.encode_www_form_component(cmpd.to_inchi)
+          inchi = cmpd.to_smiles() #URI.encode_www_form_component(cmpd.to_inchi)
 
           # allocate container
           row_container = Array.new(compound_sizes[compound])
diff --git a/lib/stratification.R b/lib/stratification.R
index 3f8698c..c15dee6 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -115,6 +115,52 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
       stop("unknown method")
 }
 
+# marks round_it(nrow(data)*ratio) instances that all belong to one single cluster with 1
+# and the remaining instances with 0 (for ratio > 0.5 the single-cluster instances get the 0)
+# counterpart of stratified_split, selected via anti_stratification in r-util.rb
+# data: instances to split, ratio: share of instances marked with 1
+# colnames: handed to process_data
+# returns a numeric vector with one entry per row of data
+anti_stratified_split <- function( data, ratio=0.3, colnames=NULL)
+{
+  # a ratio above 0.5 is handled as 1-ratio, the result is inverted at the end
+  swap = ratio > 0.5
+  if (swap)
+    ratio = 1-ratio
+
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("anti-split using #features: ",ncol(data.processed)))
+  # num_c is passed twice to cluster() - presumably min and max number of clusters, TODO confirm
+  num_c = floor(1/ratio)
+  cl = cluster(data.processed, num_c, num_c)
+  num = round_it(nrow(data)*ratio)
+
+  # pick the smallest cluster that has at least num members, the first one wins ties
+  # (Inf instead of a fixed upper bound, so that clusters of any size can be picked)
+  idx = -1
+  smallest = Inf
+  for(j in seq_len(max(cl)))
+  {
+    cl_size = length(subset(cl, cl==j))
+    if (cl_size<smallest && cl_size>=num)
+    {
+      idx = j
+      smallest = cl_size
+    }
+  }
+  # warn instead of silently returning an empty selection
+  if (idx == -1)
+    warning(paste("anti-split: no cluster with at least",num,"instances, nothing is selected"))
+
+  # mark the first num members of the picked cluster, everything else stays 0
+  split = rep(0, nrow(data))
+  split[head(which(cl==idx), num)] = 1
+
+  if (swap)
+    split = 1-split
+  as.vector(split)
+}
+
 stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
@@ -254,7 +300,8 @@ plot_split <- function( data, split, names=NULL, ... )
 #data<-rbind(data,c)
 #data=iris
 #split = stratified_k_fold_split(data, num_folds=3)
-#split = stratified_split(data, ratio=0.33, method="cluster")
+#split = stratified_split(data, ratio=0.75)
+#print(split)
 #print(sum(split))
 #plot_split(plot_pre_process(data),split,c("training","test"))
 
diff --git a/lib/task.rb b/lib/task.rb
index 102f4dc..d8f0ba6 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -324,8 +324,8 @@ module OpenTox
     
     def initialize(task, min, max)
       raise "not a task or subtask" if task!=nil and !(task.is_a?(Task) or task.is_a?(SubTask)) 
-      raise "invalid max ("+max.to_s+"), min ("+min.to_s+") params" unless 
-        min.is_a?(Numeric) and max.is_a?(Numeric) and min >= 0 and max <= 100 and max > min 
+      raise "subtask init: invalid max (#{max}, #{max.class}), min (#{min}, #{min.class}) params" unless 
+        min.is_a?(Numeric) and max.is_a?(Numeric) and min >= 0 and max <= 100.0 and max > min 
       @task = task
       @min = min
       @max = max
author	mguetlein <martin.guetlein@gmail.com>	2012-06-09 10:34:38 +0200
committer	mguetlein <martin.guetlein@gmail.com>	2012-06-09 10:34:38 +0200
commit	11c793da54bc304cfd7f80fcf722fb9b488811e8 (patch)
tree	4df0ed6e3fbf87b4de20d7cda156e49b952c9f1b
parent	3b2f2033aa1d0936009bf13bc32ef6938834efb6 (diff)