author     David Vorgrimmler <vorgrimmlerdavid@gmx.de>  2012-04-19 16:06:33 +0200
committer  David Vorgrimmler <vorgrimmlerdavid@gmx.de>  2012-04-19 16:06:33 +0200
commit     6dab51b5cf637b7c0d3d8585fe63f4116553ac5c (patch)
tree       d215da070142639e8d65ec30b24d3608e0561b38
parent     ad7cd1120e982253ecf0b515cc90dd0e45267685 (diff)
Manual merge with development.
-rw-r--r--  ChangeLog             |   8
-rw-r--r--  VERSION               |   2
-rw-r--r--  lib/algorithm.rb      |   2
-rw-r--r--  lib/authorization.rb  |  18
-rw-r--r--  lib/compound.rb       |   2
-rw-r--r--  lib/environment.rb    |   2
-rw-r--r--  lib/model.rb          |   3
-rw-r--r--  lib/parser.rb         |   1
-rw-r--r--  lib/r-util.rb         | 123
-rw-r--r--  lib/stratification.R  |  78
-rw-r--r--  lib/utils.rb          |   4

11 files changed, 189 insertions(+), 54 deletions(-)
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+v3.1.0 2012-02-24
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
+   gauss() removed
+
 v3.0.1 2011-10-19
  * feature: model registration to ontology service
  * ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.1
+3.1.0
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 8d661b5..b921b9c 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -551,7 +551,7 @@ module OpenTox
       # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
       # @return [Hash] Hash with matching Smarts and number of hits
       def self.lookup(params)
-        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib])
+        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib],params[:subjectid])
       end
     end
diff --git a/lib/authorization.rb b/lib/authorization.rb
index 5d57781..a9744e9 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,13 +37,15 @@ module OpenTox

     #Loads and sends Policyfile(XML) to open-sso server
     # @param [String] URI to create a policy for
-    def send(uri)
+    def send(uri)
      xml = get_xml(uri)
      ret = false
-      ret = Authorization.create_policy(xml, @subjectid)
+      ret = Authorization.create_policy(xml, @subjectid)
+      LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
+      ret = Authorization.create_policy(xml, @subjectid) if !ret
      LOGGER.debug "Policy send with subjectid: #{@subjectid}"
      LOGGER.warn "Not created Policy is: #{xml}" if !ret
-      ret
+      ret
    end
  end
@@ -337,7 +339,7 @@ module OpenTox
  # @param [String] subjectid
  # @return [Boolean] true if access granted, else otherwise
  def self.authorized?(uri, request_method, subjectid)
-    if CONFIG[:authorization][:free_request].include?(request_method)
+    if CONFIG[:authorization][:free_request].include?(request_method)
      #LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
      true
    elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -360,7 +362,7 @@ module OpenTox
      false
    end
  end
-
+
  private
  def self.free_uri?(uri, request_method)
    if CONFIG[:authorization][:free_uris]
@@ -374,7 +376,7 @@ module OpenTox
    end
    return false
  end
-
+
  def self.authorize_exception?(uri, request_method)
    if CONFIG[:authorization][:authorize_exceptions]
      CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -387,6 +389,6 @@ module OpenTox
    end
    return false
  end
-
+
 end
-end
\ No newline at end of file
+end
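The change to Authorization#send above retries a failed create_policy exactly once before giving up. A minimal sketch of that retry-once idiom in isolation (the send_with_retry wrapper is illustrative and not part of the library; Authorization.create_policy and LOGGER are the calls used in the hunk):

    # Retry an unreliable service call once before reporting failure.
    def send_with_retry(xml, subjectid)
      ret = Authorization.create_policy(xml, subjectid)
      unless ret
        LOGGER.warn "create_policy failed for subjectid: #{subjectid}, retrying once"
        ret = Authorization.create_policy(xml, subjectid)  # second and last attempt
      end
      ret
    end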
diff --git a/lib/compound.rb b/lib/compound.rb
index b180b15..a08d541 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -259,7 +259,7 @@ module OpenTox
        uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib})
        ds = OpenTox::Dataset.find(uri)
        entry = ds.data_entries[self.uri]
-        ds.delete
+        ds.delete(subjectid)
        temp_ds.delete
      end
      features = entry.keys
diff --git a/lib/environment.rb b/lib/environment.rb
index 6a72ba5..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
 OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
 OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
 XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'

diff --git a/lib/model.rb b/lib/model.rb
index f8d98ba..c9d367e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -258,7 +258,8 @@ module OpenTox
            :features => @features,
            :feature_dataset_uri => @metadata[OT.featureDataset],
            :pc_type => self.parameter(\"pc_type\"),
-            :lib => self.parameter(\"lib\")
+            :lib => self.parameter(\"lib\"),
+            :subjectid => subjectid
          })")

      # Adding fingerprint of query compound with features and values(p_value*nr_hits)
diff --git a/lib/parser.rb b/lib/parser.rb
index 2e1dc5d..07b44db 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -536,6 +536,7 @@ module OpenTox
    def initialize
      @data = {}
      @activity_errors = []
+      @max_class_values = 3
    end

    def feature_values(feature)
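The compound.rb, model.rb, and algorithm.rb hunks all follow one pattern: the A&A token (subjectid) is threaded one call deeper so that every request to a protected OpenTox service carries it. A hedged sketch of the pattern (the helper name is hypothetical; Dataset.find and delete accepting a subjectid are shown in the hunks above):

    # Every call that touches a protected OpenTox service forwards the token.
    def delete_dataset_with_auth(dataset_uri, subjectid=nil)
      ds = OpenTox::Dataset.find(dataset_uri, subjectid)  # authenticated lookup
      ds.delete(subjectid)                                # authenticated delete
    end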
"pca" : "smacof")}')" + #@r.eval "save.image(\"/tmp/image.R\")" + @r.eval "df.2d <- plot_pre_process(df, method='sammon')" waiting_task.progress(75) if waiting_task - if fast_plot - info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'" - else - info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'" - end LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, #{info})" + @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')" end end @@ -170,19 +176,68 @@ module OpenTox end end - # stratified splits a dataset into two dataset the feature values + # stratified splits a dataset into two dataset according to the feature values + # all features are taken into account unless <split_features> is given + # returns two datases + def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) + end + + # stratified splits a dataset into k datasets according the feature values # all features are taken into account unless <split_features> is given - def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + # returns two arrays of datasets + def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features ) + end + + private + def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) + raise "internal error" if num_folds!=nil and pct!=nil + k_fold_split = num_folds!=nil + if k_fold_split + raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum) + else + raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) + end raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 + raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 + raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") - df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features ) + df = dataset_to_dataframe( dataset, missing_values, subjectid) @r.eval "set.seed(#{seed})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct})" - split = @r.pull 'split' - split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set - split_to_datasets( df, split, subjectid ) + str_split_features = "" + if split_features + @r.split_features = split_features if split_features + str_split_features = "colnames=split_features" + end + @r.eval "save.image(\"/tmp/image.R\")" + + if k_fold_split + @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})" + split = @r.pull 'split' + train = [] + test = [] + num_folds.times do |f| + datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s + metadata[DC.title] = "training "+datasetname + train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) } + metadata[DC.title] = "test "+datasetname + test << split_to_dataset( df, split, metadata, 
diff --git a/lib/stratification.R b/lib/stratification.R
index 76ff2d8..3f8698c 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,4 +1,13 @@

+round_it <- function( x )
+{
+  if(isTRUE((x - floor(x))>=0.5))
+    ceiling(x)
+  else
+    floor(x)
+}
+
+
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
   result
 }

-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
 {
   data.num <- as.data.frame(data)
+  if (!is.null(colnames))
+  {
+    data.num = subset(data.num, select = colnames)
+  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }

-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
 {
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
     stop("unknown method")
 }

-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=3)
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
     stop("unknown method")
 }

+duplicate_indices <- function( data ) {
+  indices = 1:nrow(data)
+  z = data
+  duplicate_index = anyDuplicated(z)
+  while(duplicate_index) {
+    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+    indices[duplicate_index] <- duplicate_to_index
+    z[duplicate_index,] <- paste('123$§%',duplicate_index)
+    duplicate_index = anyDuplicated(z)
+  }
+  indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+  result = data[1,]
+  for(i in 2:length(dup_indices)) {
+    row = data[rownames(data)==dup_indices[i],]
+    if(length(row)==0)
+      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+    result = rbind(result, row)
+  }
+  rownames(result)<-NULL
+  result
+}
+
+sammon_duplicates <- function( data, ... ) {
+  di <- duplicate_indices(data)
+  print(di)
+  u <- unique(data)
+  print(paste('unique data points',nrow(u),'of',nrow(data)))
+  if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
+  points_unique <- sammon(dist(u), ...)$points
+  if (nrow(u)<nrow(data))
+  {
+    points <- add_duplicates(points_unique, di)
+    points
+  }
+  else
+  {
+    points_unique
+  }
+}
+
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
+  else if (method == "sammon")
+  {
+    require("MASS")
+    sammon_duplicates(data.processed, k=2)
+  }
   else
     stop("unknown method")
 }
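round_it replaces R's round() in the samplecube ratio adjustment, presumably because R rounds half to even (round(0.5) is 0, round(2.5) is 2), which can shift the returned sample count by one. For illustration only, the same round-half-up rule in Ruby (a sketch, not library code):

    # Round half up, mirroring round_it in stratification.R.
    def round_half_up(x)
      (x - x.floor) >= 0.5 ? x.ceil : x.floor
    end

    round_half_up(2.5)  #=> 3  (R's round(2.5) yields 2)
    round_half_up(2.4)  #=> 2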
diff --git a/lib/utils.rb b/lib/utils.rb
index a3f8161..88b8347 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -359,7 +359,7 @@ module OpenTox
     # @param[Hash] keys: SMILES, values: InChIs
     # @param[Array] field descriptions, one for each feature
     # @return[Array] CSV, array of field ids, array of field descriptions
-    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids)
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
       master=nil
       ids=[]

@@ -369,7 +369,7 @@ module OpenTox
       (1...ambit_result_uri.size).collect { |idx|
         curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
         #LOGGER.debug "Requesting #{curr_uri}"
-        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) )
+        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
         if csv_data[0] && csv_data[0].size>1
           if master.nil? # This is the smiles entry
             (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
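With the utils.rb change, each Ambit CSV request also carries the token. A hedged sketch of the resulting call shape (curr_uri is a placeholder; RestClientWrapper.get and the header hash are taken from the hunk above):

    require "csv"

    curr_uri = "http://example.org/ambit/dataset/1"  # placeholder URI
    raw = OpenTox::RestClientWrapper.get(curr_uri,
            {:accept => "text/csv", :subjectid => subjectid})
    csv_data = CSV.parse(raw)  # column 1 holds SMILES, rewritten to InChI by load_ds_csv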