Merge branch 'pc_new_1' of github.com:opentox/opentox-ruby into pc_new_1

author: Andreas Maunz <andreas@maunz.de> 2012-02-10 09:31:31 +0100
committer: Andreas Maunz <andreas@maunz.de> 2012-02-10 09:31:31 +0100
commit: c4d79f636827def2b6ac288275570ecfc7187bf1 (patch)
tree: d74f64ad68b9fe8a24789974a1255fe03cdf743f
parent: a56315499d714a783078d4a02c8982ccdb510cff (diff)
parent: 771514f7a6be11b87def56577ea09327ef328246 (diff)
7 files changed, 602 insertions, 7 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 9e9e62d..db21c46 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -92,7 +92,7 @@ module OpenTox
                   LOGGER.warn "No #{feature} activity for #{compound.to_s}."
                 else
                   if @prediction_feature.feature_type == "classification"
-                    activity= value_map.invert[value].to_i # activities are mapped to 1..n
+                    activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
                     @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
                   elsif @prediction_feature.feature_type == "regression"
                     activity= value.to_f 
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 8f76ee7..95c1918 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -373,7 +373,45 @@ module OpenTox
       dataset.save(subjectid)
       dataset
     end
-
+    
+    # Merges two datasets into a new dataset (by default uses all compounds and features).
+    # The new dataset is created at the dataset service and saved before it is returned.
+    # Neither the input datasets nor the passed metadata hash are modified.
+    # precondition: both datasets are fully loaded
+    # @param [OpenTox::Dataset] dataset1 to merge
+    # @param [OpenTox::Dataset] dataset2 to merge
+    # @param [Hash] metadata for the new dataset (may be nil); OT.hasSource is set if missing
+    # @param [optional,String] subjectid
+    # @param [optional,Array] features1, if specified only these features of dataset1 are used
+    # @param [optional,Array] features2, if specified only these features of dataset2 are used
+    # @param [optional,Array] compounds1, if specified only these compounds of dataset1 are used
+    # @param [optional,Array] compounds2, if specified only these compounds of dataset2 are used
+    # @return [OpenTox::Dataset] the merged dataset
+    # example: if you want no features from dataset2, give empty array as features2
+    def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil )
+      features1 ||= dataset1.features.keys
+      features2 ||= dataset2.features.keys
+      compounds1 ||= dataset1.compounds
+      compounds2 ||= dataset2.compounds
+      data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}")
+      [[dataset1, features1, compounds1], [dataset2, features2, compounds2]].each do |dataset,features,compounds|
+        compounds.each{|c| data_combined.add_compound(c)}
+        features.each do |f|
+          # work on a copy, otherwise the source dataset's feature metadata would be altered
+          m = dataset.features[f].dup
+          m[OT.hasSource] ||= dataset.uri
+          data_combined.add_feature(f,m)
+          compounds.each do |c|
+            values = dataset.data_entries[c] && dataset.data_entries[c][f]
+            next unless values
+            values.each{|v| data_combined.add(c,f,v)}
+          end
+        end
+      end
+      # copy the metadata as well, so the caller's hash is left untouched
+      metadata = metadata ? metadata.dup : {}
+      metadata[OT.hasSource] ||= "Merge from #{dataset1.uri} and #{dataset2.uri}"
+      data_combined.add_metadata(metadata)
+      data_combined.save(subjectid)
+      data_combined
+    end
+    
     # Save dataset at the dataset service 
     # - creates a new dataset if uri is not set
     # - overwrites dataset if uri exists
diff --git a/lib/opentox-ruby.rb b/lib/opentox-ruby.rb
index 1fa2a86..d25632c 100644
--- a/lib/opentox-ruby.rb
+++ b/lib/opentox-ruby.rb
@@ -9,6 +9,6 @@ rescue LoadError
 end
 
 ['opentox', 'compound','dataset', 'parser','serializer', 'algorithm','model','task','validation','feature', 
- 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology' ].each do |lib|
+ 'rest_client_wrapper', 'authorization', 'policy', 'helper', 'to-html', 'ontology', 'r-util' ].each do |lib|
 	require lib
 end
diff --git a/lib/parser.rb b/lib/parser.rb
index ae8ada6..18c0ba7 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -57,7 +57,7 @@ module OpenTox
         `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
           triple = line.to_triple
           if triple[0] == @uri
-            if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
+            if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
               @metadata[triple[1]] = [] unless @metadata[triple[1]]
               @metadata[triple[1]] << triple[2].split('^^').first
             else
diff --git a/lib/r-util.rb b/lib/r-util.rb
new file mode 100644
index 0000000..7163c46
--- /dev/null
+++ b/lib/r-util.rb
@@ -0,0 +1,354 @@
+# pending: package dir hack ---------
+# The R package directory replaces the last path segment of the configured base dir, e.g.
+# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
+# PACKAGE_DIR       = "/home/<user>/opentox-ruby/r-packages"
+path_segments = CONFIG[:base_dir].split("/")
+path_segments[-1] = "r-packages"
+PACKAGE_DIR = path_segments.join("/")
+
+require "tempfile"
+
+module OpenTox
+  
+  class RUtil
+    
+    @@feats = {}
+      
+    # Starts an R session via RinRuby, sets R's library path to PACKAGE_DIR,
+    # installs the required R packages if they are missing and sources
+    # lib/stratification.R from the loaded opentox-ruby gem.
+    def initialize
+      @r ||= RinRuby.new(true,false)
+      @r.eval ".libPaths('#{PACKAGE_DIR}')"
+      @r_packages = @r.pull "installed.packages()[,1]"
+      required_packages = ["sampling","gam","vegan"] #"caret", "smacof", "TunePareto"
+      required_packages.each{|package| install_package(package)}
+      script = File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')
+      @r.eval "source('#{script}')"
+    end
+    
+    # Quits the R session and drops the handle.
+    # Errors raised while quitting are deliberately ignored (best effort).
+    def quit_r
+      @r.quit
+      @r = nil
+    rescue
+      nil
+    end
+    
+    # @return the R session object held in @r (nil after quit_r succeeded)
+    def r
+      @r
+    end
+    
+    # Checks the package list that was pulled from R (installed.packages()) in initialize.
+    # @param [String] package name of the R package
+    # @return true if the package is included in that list
+    def package_installed?( package )
+      @r_packages.include?(package) 
+    end
+    
+    # Installs an R package from CRAN into PACKAGE_DIR unless it is already installed.
+    # Afterwards the cached package list is refreshed, so that package_installed?
+    # reflects the installation (previously the list was only read once in initialize).
+    # @param [String] package name of the R package
+    def install_package( package )
+      return if package_installed?(package)
+      LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
+      @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
+      @r_packages = @r.pull "installed.packages()[,1]"
+    end
+    
+    # Paired t-test on two value arrays, computed in R via t.test(v1,v2,paired=T).
+    # Returns the t statistic if the p-value is below 1-significance_level, otherwise 0:
+    # <0 -> array1 << array2
+    # 0  -> no significant difference
+    # >0 -> array1 >> array2 (the statistic is based on the differences array1 - array2)
+    # @param [Array] array1 first sample
+    # @param [Array] array2 second sample, paired with array1
+    # @param [optional,Float] significance_level defaults to 0.95
+    def paired_ttest(array1, array2, significance_level=0.95)
+      @r.assign "v1",array1
+      @r.assign "v2",array2
+      @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+      t_statistic = @r.pull "ttest$statistic"
+      p_value = @r.pull "ttest$p.value"
+      (1-significance_level > p_value) ? t_statistic : 0
+    end
+    
+    # Draws one box per data series into each of the given files.
+    # data is an array of [name, values] pairs; names become the box labels.
+    # example: 
+    # files = ["/tmp/box.svg","/tmp/box.png"]
+    # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
+    # boxplot(files, data, "comparison1" )
+    #
+    def boxplot(files, data, title="")
+      LOGGER.debug("r-util> create boxplot")
+      # each series becomes one dataframe column, hence the transpose
+      rows = data.collect{|series| series[1]}.transpose
+      labels = data.collect{|series| series[0].to_s}
+      assign_dataframe("boxdata",rows,nil,labels)
+      last_color = data.size+1
+      plot_to_files(files) do |file|
+        @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{last_color}))"
+      end
+    end
+
+    # Embeds the feature values of two datasets into 2D and plots the result into the given files.
+    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method) 
+    # If features is given, both datasets have to include all of them;
+    # otherwise both datasets need identical feature sets and all features are used.
+    # Raises if fewer than two features are selected, or if smacof is required but not installed.
+    # waiting_task (optional) gets its progress updated (25/50/75).
+    def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
+        features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+        
+      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
+      LOGGER.debug("r-util> create feature value plot")
+      d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
+      d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
+      if features
+        [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}} 
+      else
+        raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if 
+          (d1.features.keys.sort != d2.features.keys.sort)
+        features = d1.features.keys
+      end
+      # the embedding is computed on the selected features, so their number is what matters
+      raise "at least two features needed" if features.size<2
+      waiting_task.progress(25) if waiting_task
+      
+      df1 = dataset_to_dataframe(d1,0,subjectid,features)
+      df2 = dataset_to_dataframe(d2,0,subjectid,features)
+      waiting_task.progress(50) if waiting_task
+      
+      @r.eval "df <- rbind(#{df1},#{df2})"
+      # split marks the origin of each row: 0 -> first dataset, 1 -> second dataset
+      @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+      @r.names = [dataset_name1, dataset_name2]
+      LOGGER.debug("r-util> - convert data to 2d")
+      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+      waiting_task.progress(75) if waiting_task
+      
+      if fast_plot
+        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+      else
+        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+      end
+      LOGGER.debug("r-util> - plot data")
+      plot_to_files(files) do |file|
+        @r.eval "plot_split( df.2d, split, names, #{info})"
+      end
+    end
+    
+    # plots a double histogram into each of the given files
+    # data1 and data2 are arrays with values, either numerical or categorial (string values)
+    # is_numerical, boolean flag indicating value types
+    # log (only for numerical), plot logarithm of values; raises if set for categorial values
+    # name1/name2 label the two value sets, title is the plot title, xaxis the x-axis label (numerical only)
+    # numerical: two overlaid histograms sharing the same breaks
+    # categorial: grouped barplot of the counts of each distinct value in data1 and data2
+    def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
+      LOGGER.debug("r-util> create double hist plot")
+      all = data1 + data2
+      if (is_numerical)
+        # define the R plotting function; it is called later inside plot_to_files
+        @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
+        {
+          if (log)
+          {
+            data1 <- log(data1)
+            data2 <- log(data2)
+            xlab = paste('logarithm of',xlab,sep=' ')
+          }
+          xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
+          h <- hist(rbind(data1,data2),plot=F)
+          h1 <- hist(data1,plot=F,breaks=h$breaks)
+          h2 <- hist(data2,plot=F,breaks=h$breaks)
+          xlims = c(min(h$breaks),max(h$breaks))
+          ylims = c(0,max(h1$counts,h2$counts))
+          xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
+          plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
+            main=title, xlab=xlab, ylab='counts' )
+          plot(h2, col=rgb(0,1,0,2/4), add=T )
+          legend('topleft',names,lty=c(1,1),col=c('red','green'))
+        }" 
+        @r.assign("data1",data1)
+        @r.assign("data2",data2)
+        @r.legend = [name1, name2]
+      else
+        raise "log not valid for categorial" if log
+        # count how often each distinct value occurs in data1 and data2
+        vals = all.uniq.sort!
+        counts1 = vals.collect{|e| data1.count(e)}
+        counts2 = vals.collect{|e| data2.count(e)}
+        @r.data1 = counts1
+        @r.data2 = counts2
+        @r.value_names = [name1, name2]
+        @r.legend = vals
+        @r.eval("data <- cbind(data1,data2)")
+      end
+      
+      plot_to_files(files) do |file|
+        if (is_numerical)
+          @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
+        else
+          @r.eval("bp <- barplot(data, beside=T, names.arg=value_names, 
+            main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
+          # print the counts on top of the bars
+          @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
+        end
+      end
+    end
+    
+    # Splits a dataset into two datasets, stratified according to its feature values
+    # (the split itself is computed by the R function stratified_split).
+    # all features are taken into account unless <split_features> is given
+    # pct is passed to R as split ratio, seed initializes R's random number generator
+    # returns an array of the resulting datasets
+    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+      loaded = dataset.is_a?(OpenTox::Dataset) && dataset.compounds.size>0 && dataset.features.size>0
+      raise "not a loaded ot-dataset" unless loaded
+      LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
+      
+      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+      @r.eval "set.seed(#{seed})"
+      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+      # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+      assignment = (@r.pull 'split').collect{|s| 1-s.to_i}
+      split_to_datasets( df, assignment, subjectid )
+    end
+    
+    # Converts a dataset into an R dataframe: one row per compound occurrence, one column per feature.
+    # dataset should be loaded completely (use Dataset.find)
+    # takes duplicates into account: a compound gets as many rows as the largest number of values
+    # it has for any single feature of the dataset
+    # replaces missing values with param <missing_value>
+    # nominal features (looked up via OpenTox::Feature.find) become character columns, all others numeric
+    # the passed features array is not modified (a sorted copy is used)
+    # returns dataframe-variable-name in R
+    def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
+      LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
+      
+      # count duplicates
+      num_compounds = {}
+      dataset.features.keys.each do |f|
+        dataset.compounds.each do |c|
+          entries = dataset.data_entries[c]
+          if entries
+            val = entries[f]
+            size = val.nil? ? 1 : val.size
+            num_compounds[c] = num_compounds[c].nil? ? size : [num_compounds[c],size].max
+          else
+            num_compounds[c] = 1
+          end
+        end
+      end  
+      
+      # use either all, or the provided features, sorting is important as col-index := features
+      # (sort instead of sort!, the caller's array must not be reordered)
+      features = features ? features.sort : dataset.features.keys.sort
+
+      # values into 2D array (plus one row name per row), then to dataframe
+      compounds = []
+      d_values = []
+      dataset.compounds.each do |c|
+        num_compounds[c].times do |i|
+          compounds << c
+          c_values = features.collect do |f|
+            if dataset.data_entries[c]
+              val = dataset.data_entries[c][f]
+              v = val.nil? ? "" : val[i].to_s
+            else
+              # compounds without data entries were counted exactly once above
+              raise "wtf" if i>0
+              v = ""
+            end
+            v.size==0 ? missing_value : v
+          end
+          d_values << c_values
+        end
+      end  
+      df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
+      assign_dataframe(df_name,d_values,compounds,features)
+      
+      # set dataframe column types accordingly
+      features.each_with_index do |f,index|
+        column = index+1 #R starts at 1
+        feat = OpenTox::Feature.find(f,subjectid)
+        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        if nominal
+          @r.eval "#{df_name}[,#{column}] <- as.character(#{df_name}[,#{column}])"
+        else
+          @r.eval "#{df_name}[,#{column}] <- as.numeric(#{df_name}[,#{column}])"
+        end
+      end
+      #@r.eval "head(#{df_name})"
+      
+      # store features (including metainformation), needed to convert the dataframe back into a dataset
+      @@feats[df_name] = {}
+      features.each do |f|
+        @@feats[df_name][f] = dataset.features[f]
+      end
+      df_name
+    end
+    
+    # Converts a complete dataframe (all rows) into a dataset; a new dataset is created at the dataset webservice.
+    # This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
+    # @param [String] df dataframe-variable-name in R
+    # @param [optional,String] subjectid
+    def dataframe_to_dataset( df, subjectid=nil )
+      # omitting compound_indices (default nil) selects every row
+      dataframe_to_dataset_indices( df, subjectid )
+    end
+    
+    private
+    # Converts the rows of an R dataframe into a new dataset (created and saved at the dataset webservice).
+    # Feature metadata is taken from what dataset_to_dataframe stored for this dataframe name.
+    # Values equal to "NA" are skipped, values of non-nominal features are converted to floats.
+    # @param [String] df dataframe-variable-name in R
+    # @param [optional,String] subjectid
+    # @param [optional,Array] compound_indices 0-based row indices to use, nil means all rows
+    # Raises a RuntimeError if the dataframe or one of its features is unknown.
+    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+      feats = @@feats[df]
+      raise "unknown dataframe '#{df}', create it with dataset_to_dataframe first" unless feats and feats.size>0
+      values, compounds, features = pull_dataframe(df)
+      features.each{|f| raise "feature '#{f}' not available for dataframe '#{df}'" unless feats[f]}
+      dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
+      selected = lambda{|i| compound_indices==nil or compound_indices.include?(i)}
+      compounds.each_with_index{|compound,i| dataset.add_compound(compound) if selected.call(i)}
+      features.each{|f| dataset.add_feature(f,feats[f])}
+      features.each_with_index do |f,col|
+        feat = OpenTox::Feature.find(f,subjectid)
+        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        compounds.each_with_index do |compound,row|
+          next unless selected.call(row)
+          val = values[row][col]
+          next if val=="NA"
+          dataset.add(compound,f,nominal ? val : val.to_f)
+        end
+      end
+      dataset.save(subjectid)
+      dataset
+    end    
+    
+    # Creates one dataset per distinct split value (from split.min to split.max).
+    # split holds one integer per dataframe row; rows with the same value end up in the same dataset.
+    # Returns the datasets ordered by split value.
+    def split_to_datasets( df, split, subjectid=nil )
+      (split.min.to_i .. split.max.to_i).collect do |part|
+        indices = (0...split.size).select{|j| split[j]==part}
+        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+        dataset
+      end
+    end
+    
+    # Reads an R dataframe into ruby by letting R write it to a temporary '#'-separated file.
+    # The file is closed after reading and removed in any case, also if R or the parsing fails.
+    # @param [String] df dataframe-variable-name in R
+    # @return [Array] values (2D array of strings, one row per dataframe row), row names, column names
+    def pull_dataframe(df)
+      tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+      res = []; compounds = []; features = []
+      begin
+        @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
+        File.open(tmp, 'r') do |file|
+          first = true
+          file.each_line("\n") do |row|
+            cells = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+            if first
+              # header line holds the column names only
+              features = cells
+              first = false
+            else
+              # first cell of each data line is the row name
+              compounds << cells[0]
+              res << cells[1..-1]
+            end
+          end
+        end
+      ensure
+        begin File.delete(tmp); rescue; end
+      end
+      return res, compounds, features
+    end
+    
+    # Creates an R dataframe from a 2D ruby array by writing a temporary '#'-separated file
+    # that is read in R via read.table.
+    # The file is closed before R reads it and removed in any case, also if R fails.
+    # @param [String] df name of the dataframe variable to create in R
+    # @param [Array] input 2D array, one inner array per row
+    # @param [Array] rownames row names, nil if no row names should be set
+    # @param [Array] colnames column names
+    def assign_dataframe(df,input,rownames,colnames)
+      tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+      begin
+        File.open(tmp, 'w') do |file|
+          input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}  
+        end
+        @r.rownames = rownames if rownames
+        @r.colnames = colnames
+        @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
+          "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
+      ensure
+        begin File.delete(tmp); rescue; end
+      end
+    end
+    
+    # Opens an R graphics device for each file (svg or png, chosen by the file name),
+    # yields the file name so that the caller can issue the R plot commands,
+    # and closes the device again - also if the plotting block raises,
+    # so that no device is left open in the R session.
+    # @param [Array] files names of the files to plot to
+    # Raises "invalid format" for file names that contain neither .svg nor .png
+    def plot_to_files(files)
+      files.each do |file|
+        if file=~/(?i)\.svg/
+          @r.eval("svg('#{file}',10,8)")
+        elsif file=~/(?i)\.png/
+          @r.eval("png('#{file}')")
+        else
+          raise "invalid format: "+file.to_s
+        end
+        begin
+          yield file
+          LOGGER.debug "r-util> plotted to #{file}"
+        ensure
+          @r.eval("dev.off()")
+        end
+      end
+    end
+  end
+end
+
+
diff --git a/lib/stratification.R b/lib/stratification.R
new file mode 100644
index 0000000..76ff2d8
--- /dev/null
+++ b/lib/stratification.R
@@ -0,0 +1,201 @@
+
+# Converts all non-numeric columns of a data frame into binary indicator columns
+# (one column "<name> is <value>" per distinct value; only one column if there
+# are exactly two distinct values). Numeric columns are copied unchanged.
+# Missing values in a nominal column do not form a category of their own, the
+# corresponding indicator entries are NA (instead of aborting the conversion).
+# Returns NULL if data has no columns.
+nominal_to_binary <- function( data )
+{
+  result = NULL
+  for (i in seq_len(ncol(data)))
+  {
+     column = data[,i]
+     if (is.numeric( column ) )
+     {
+        if (is.null(result))
+          result = data.frame(column)
+        else
+          result = data.frame(result, column)
+        colnames(result)[ncol(result)] <- colnames(data)[i]
+     }
+     else
+     {
+        vals = unique(column[!is.na(column)])
+        for (j in seq_along(vals))
+        {
+           # vectorized comparison: 1 where the row has this value, 0 otherwise
+           bins = as.numeric(column == vals[j])
+           if (is.null(result))
+             result = data.frame(bins)
+           else
+             result = data.frame(result, bins)
+           colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+           # for two values the second indicator would be redundant
+           if (length(vals)==2) break
+        }
+     }
+  }
+  result
+}
+
+# Prepares data for clustering/embedding: converts it to a data frame,
+# turns nominal columns into binary ones (nominal_to_binary) and,
+# if any value is missing, replaces missing values via na.gam.replace (package gam).
+process_data <- function( data )
+{
+  data.num <- as.data.frame(data)
+  if (!is.numeric(data.num))
+    data.num = nominal_to_binary(data.num)
+  if (!any(is.na(data.num)))
+    return(data.num)
+  require("gam")
+  na.gam.replace(data.num)
+}
+
+# Clusters data with cascade k-means (cascadeKM from package vegan), trying min..max clusters.
+# max is limited to the number of distinct rows and to nrow(data)-1; min is lowered to max if needed.
+# Returns a one-column matrix with the cluster index of each row for the partition
+# selected via the second row of the cascadeKM results.
+cluster <- function( data, min=10, max=15 )
+{
+  require("vegan")
+  max <- min(max,nrow(unique(data)),nrow(data)-1)
+  if (min>max)
+    min=max
+  print(paste("cascade k-means ",min," - ",max))
+  km = cascadeKM(data,min,max,iter=30)
+  best = max.col(km$results)[2]
+  print(paste("best k-means clustering result: ",((best-1)+min)," num clusters"))
+  cbind(km$partition[,best])
+}
+
+# Stratified split of data into two parts; returns the result of samplecube
+# (package sampling), one entry per row.
+# method "samplecube": applies samplecube directly to the processed data
+# method "cluster": clusters the data first, then applies the samplecube method
+#                   to the cluster assignment
+stratified_split <- function( data, ratio=0.3, method="cluster" )
+{
+    data.processed = as.matrix(process_data( data ))
+    if (method == "samplecube")
+    {
+      require("sampling")
+      num.rows = nrow(data.processed)
+      # adjust ratio to make samplecube return exact number of samples
+      ratio = round(num.rows*ratio)/num.rows
+      pik = rep(ratio,times=num.rows)
+      data.strat = cbind(pik,data.processed)
+      samplecube(data.strat,pik,order=2,comment=F)
+    }
+    else if (method == "cluster")
+    {
+      cl = cluster(data.processed)
+      # (an earlier variant used createDataPartition from package caret here)
+      require("sampling")
+      stratified_split(cl,ratio,"samplecube")
+    }
+    else
+      stop("unknown method")
+}
+
+# Assigns each row of data to one of num_folds folds; returns a vector with
+# the fold index (1..num_folds) of each row.
+# method "samplecube": draws the folds one after another with samplecube (package sampling);
+#                      rows that remain unassigned form the last fold
+# method "cluster": clusters the data and lets generateCVRuns (package TunePareto)
+#                   create num_folds folds from the cluster assignment
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+{
+  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+  data.processed = as.matrix(process_data( data ))
+  if (method == "samplecube")
+  {
+    folds = rep(0, times=nrow(data))
+    # seq_len: no iteration at all for a single fold (1:0 would count backwards)
+    for (i in seq_len(num_folds-1))
+    {
+      require("sampling")
+      prop = 1/(num_folds-(i-1))
+      print(paste("fold",i,"/",num_folds," prop",prop))
+      pik = rep(prop,times=nrow(data))
+      # rows that already belong to a fold must not be drawn again
+      pik[folds != 0] = 0
+      data.strat = cbind(pik,data.processed)
+      s<-samplecube(data.strat,pik,order=2,comment=F)
+      print(paste("fold size: ",sum(s)))
+      folds[s == 1] = i
+    }
+    folds[folds == 0] = num_folds
+    folds
+  }
+  else if (method == "cluster")
+  {
+    require("TunePareto")
+    cl = cluster(data.processed)
+    # number of folds has to match num_folds (was hard-coded to 3)
+    # NOTE(review): generateCVRuns is called without stratified=TRUE - confirm that
+    # the cluster labels are actually meant to be used for stratification here
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+    folds = rep(0, times=nrow(data))
+    for (i in seq_len(num_folds))
+      folds[res[[1]][[i]]] = i
+    folds
+  }
+  else
+    stop("unknown method")
+}
+
+# Embeds data into two dimensions for plotting.
+# method "pca": first two principal components (prcomp on scaled data)
+# method "smacof": 2D configuration computed by smacofSym (package smacof) on euclidean distances
+plot_pre_process <- function( data, method="pca" )
+{
+  data.processed = process_data( data )
+  if (method != "pca" && method != "smacof")
+    stop("unknown method")
+  if (method == "pca")
+  {
+    data.pca <- prcomp(data.processed, scale=TRUE)
+    return(as.data.frame(data.pca$x)[1:2])
+  }
+  require("smacof")
+  distances <- dist(data.processed, method = "euclidean")
+  data.emb <- smacofSym(distances, ndim=2, verbose=T)
+  data.emb$conf
+}
+
+# Plots two-dimensional data, colouring each point according to its split index.
+# data: two numeric columns (see plot_pre_process), split: one index (0,1,..) per row,
+# names: legend labels (defaults to "split 1","split 2"), ...: passed on to plot()
+# Splits are drawn from the highest index down to 0, so lower indices are drawn on top.
+plot_split <- function( data, split, names=NULL, ... )
+{
+  if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+    stop("data not suitable for plotting, plot_pre_process() first")
+
+  plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+  if (is.null(names))
+    names <- c("split 1","split 2")
+  colos = as.double(rep(2:(max(split)+2)))
+  legend("topleft",names,pch=2,col=colos)
+
+  for (j in max(split):0)
+  {
+    # row indices of this split, selected at once instead of growing a vector per row
+    set = which(split == j)
+    points(data[set,], pch = 2, col=(j+2))
+  }
+}
+
+#a<-matrix(rnorm(100, mean=50,  sd=4), ncol=5)
+#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+#data<-rbind(a,b)
+#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+#data<-rbind(data,c)
+#data=iris
+#split = stratified_k_fold_split(data, num_folds=3)
+#split = stratified_split(data, ratio=0.33, method="cluster")
+#print(sum(split))
+#plot_split(plot_pre_process(data),split,c("training","test"))
+
+#cl = cluster(data)
+
+
+
+
diff --git a/lib/task.rb b/lib/task.rb
index 66825cd..102f4dc 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -242,18 +242,20 @@ module OpenTox
     # waits for a task, unless time exceeds or state is no longer running
     # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
     # @param [optional,Numeric] dur seconds pausing before cheking again for completion
-    def wait_for_completion( waiting_task=nil, dur=0.3)
+    def wait_for_completion( waiting_task=nil)
       
       waiting_task.waiting_for(self.uri) if waiting_task
       due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+      start_time = Time.new
+      dur = 0
       LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
       
       load_metadata # for extremely fast tasks
       check_state
       while self.running? or self.queued?
         sleep dur
-        #LOGGER.debug "dv ---------------- dur: '#{dur}'"
-        dur = dur*2 unless dur>=30.0 
+        dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+        #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
         load_metadata 
         # if another (sub)task is waiting for self, set progress accordingly 
         waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
author	Andreas Maunz <andreas@maunz.de>	2012-02-10 09:31:31 +0100
committer	Andreas Maunz <andreas@maunz.de>	2012-02-10 09:31:31 +0100
commit	c4d79f636827def2b6ac288275570ecfc7187bf1 (patch)
tree	d74f64ad68b9fe8a24789974a1255fe03cdf743f
parent	a56315499d714a783078d4a02c8982ccdb510cff (diff)
parent	771514f7a6be11b87def56577ea09327ef328246 (diff)