-rw-r--r--  ChangeLog             |   8
-rw-r--r--  VERSION               |   2
-rw-r--r--  lib/algorithm.rb      |  30
-rw-r--r--  lib/authorization.rb  |  18
-rw-r--r--  lib/compound.rb       |  53
-rw-r--r--  lib/environment.rb    |   2
-rw-r--r--  lib/model.rb          |   8
-rw-r--r--  lib/parser.rb         |  17
-rw-r--r--  lib/r-util.rb         | 123
-rw-r--r--  lib/serializer.rb     |  11
-rw-r--r--  lib/stratification.R  |  78
-rw-r--r--  lib/transform.rb      |   4
-rw-r--r--  lib/utils.rb          | 175
13 files changed, 299 insertions, 230 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,3 @@
-v3.1.0 2012-02-24
-    * utils.rb: added for special routines (e.g. descriptor calculation)
-    * task.rb: Polling with increasing interval
-    * parser.rb: CSV up and download fixed
-    * transform.rb: routines to create machine learning data matrices
-    * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
-      gauss() removed
-
 v3.0.1 2011-10-19
     * feature: model registration to ontology service
     * ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.1.0
+3.0.1
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 9dcf6a8..ebd2019 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -477,10 +477,22 @@ module OpenTox
       # assumes a data matrix 'features' and a vector 'y' of target values
       row.names(features)=NULL
 
+      # features with all values missing removed
+      na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+      features = features[,!names(features) %in% na_col]
+
+      # features with infinite values removed
+      inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+      features = features[,!names(features) %in% inf_col]
+
+      # features with zero variance removed
+      zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+      features = features[,!names(features) %in% zero_var]
+
       pp = NULL
       if (del_missing) {
         # needed if rows should be removed
-        na_ids = apply(features,1,function(x)any(is.na(x)))
+        na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
         features = features[!na_ids,]
         y = y[!na_ids]
         pp = preProcess(features, method=c("scale", "center"))
@@ -490,15 +502,21 @@ module OpenTox
       }
       features = predict(pp, features)
 
+      # features with nan values removed (sometimes preProcess return NaN values)
+      nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+      features = features[,!names(features) %in% nan_col]
+
       # determine subsets
-      subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
-      subsets = c(2,3,4,5,7,10,subsets)
+      subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+      #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+      #subsets = c(2,3,4,5,7,10,subsets)
+      #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
       subsets = unique(sort(round(subsets)))
       subsets = subsets[subsets<=dim(features)[2]]
       subsets = subsets[subsets>1]
-      
+
       # Recursive feature elimination
-      rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+      rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
 
       # read existing dataset and select most useful features
       csv=feats[,c("SMILES", rfProfile$optVariables)]
@@ -528,7 +546,7 @@ module OpenTox
     # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
     # @return [Hash] Hash with matching Smarts and number of hits
     def self.lookup(params)
-      params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+      params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type])
     end
 
 end
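For orientation, a hedged sketch of how the slimmed-down lookup delegation above would be called after this commit. The compound, feature names and dataset URI are illustrative, and the exact module nesting of lookup is not visible in this hunk:

    # Usage sketch only -- URIs, feature names and the Algorithm namespace are assumptions
    params = {
      :compound => OpenTox::Compound.from_smiles("c1ccccc1"),
      :features => ["TPSA", "XLogP"],
      :feature_dataset_uri => "http://example.org/dataset/1",  # hypothetical
      :pc_type => "constitutional,electronic"                  # comma-separated pc types
    }
    entry = OpenTox::Algorithm.lookup(params)  # no :subjectid key needed after this change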
diff --git a/lib/authorization.rb b/lib/authorization.rb
index a9744e9..5d57781 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,15 +37,13 @@ module OpenTox
 
     #Loads and sends Policyfile(XML) to open-sso server
     # @param [String] URI to create a policy for
-    def send(uri)    
+    def send(uri)
       xml = get_xml(uri)
       ret = false
-      ret = Authorization.create_policy(xml, @subjectid)
-      LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
-      ret = Authorization.create_policy(xml, @subjectid) if !ret
+      ret = Authorization.create_policy(xml, @subjectid)
       LOGGER.debug "Policy send with subjectid: #{@subjectid}"
       LOGGER.warn "Not created Policy is: #{xml}" if !ret
-      ret  
+      ret
     end
 
   end
@@ -339,7 +337,7 @@ module OpenTox
     # @param [String] subjectid
     # @return [Boolean] true if access granted, else otherwise
     def self.authorized?(uri, request_method, subjectid)
-      if CONFIG[:authorization][:free_request].include?(request_method)  
+      if CONFIG[:authorization][:free_request].include?(request_method)
         #LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
         true
       elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -362,7 +360,7 @@ module OpenTox
         false
       end
     end
-    
+
     private
     def self.free_uri?(uri, request_method)
       if CONFIG[:authorization][:free_uris]
@@ -376,7 +374,7 @@ module OpenTox
       end
       return false
     end
-    
+
     def self.authorize_exception?(uri, request_method)
       if CONFIG[:authorization][:authorize_exceptions]
        CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -389,6 +387,6 @@ module OpenTox
       end
       return false
     end
-    
+
   end
-end
+end
\ No newline at end of file
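authorized? above consults three keys under CONFIG[:authorization]. A minimal sketch of their apparent shape, inferred from the iteration patterns in this hunk; the concrete values are illustrative, not a shipped default:

    # Hedged example of the configuration consulted by Authorization.authorized?
    CONFIG[:authorization] = {
      :free_request => ["GET"],                          # request methods that skip policy checks
      :free_uris => { ["GET"] => [/compound/] },         # method list => URI patterns (shape assumed)
      :authorize_exceptions => { ["POST"] => [/task/] }  # method list => URI patterns (shape assumed)
    }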
diff --git a/lib/compound.rb b/lib/compound.rb
index 8928081..6d3cb68 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
 module OpenTox
 
+  require "rexml/document"
 
   # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
   class Compound
@@ -130,6 +131,47 @@ module OpenTox
         "not available"
       end
     end
+
+
+    # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+    # @example
+    #   names = compound.to_names_hash
+    # @return [Hash] Classification => Name Array
+    def to_names_hash
+      begin
+        xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+        xmldoc = REXML::Document.new(xml)
+        data = {}
+
+        xmldoc.root.elements[1].elements.each{|e|
+          if data.has_key?(e.attribute("classification").value) == false
+            data[e.attribute("classification").value] = [e.text]
+          else
+            data[e.attribute("classification").value].push(e.text)
+          end
+        }
+        data
+      rescue
+        "not available"
+      end
+    end
+
+    # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+    # @example
+    #   names = compound.to_names_hash
+    # @return [Hash] Classification => Name Array
+    def to_ambit_names_hash
+      begin
+        ds = OpenTox::Dataset.new
+        ds.save
+        ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+        ds.save
+        ds.uri
+      rescue
+        "not available"
+      end
+    end
+
 
     # Match a smarts string
     # @example
@@ -197,6 +239,7 @@ module OpenTox
     # Lookup numerical values, returns hash with feature name as key and value as value
     # @param [Array] Array of feature names
     # @param [String] Feature dataset uri
+    # @param [String] Comma separated pc types
     # @return [Hash] Hash with feature name as key and value as value
     def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
       ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
@@ -211,11 +254,12 @@ module OpenTox
       LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
 
       if entry.nil?
-        uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
-        uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
-        ds = OpenTox::Dataset.find(uri,subjectid)
+        temp_ds = OpenTox::Dataset.create; temp_ds.add_compound(self.uri)
+        uri = RestClientWrapper.post(temp_ds.save + "/pcdesc", {:pc_type => pc_type})
+        ds = OpenTox::Dataset.find(uri)
         entry = ds.data_entries[self.uri]
-        ds.delete(subjectid)
+        ds.delete
+        temp_ds.delete
       end
       features = entry.keys
       features.each { |feature|
@@ -224,7 +268,6 @@ module OpenTox
         entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
       }
       #res = feature_array.collect {|v| entry[v]}
-      #LOGGER.debug "----- am #{entry.to_yaml}"
       entry
     end
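A short usage sketch for the two name helpers added above; the SMILES string is illustrative. Note that to_ambit_names_hash actually returns a dataset URI, although its @return tag still says Hash:

    # Usage sketch for the new name helpers (SMILES illustrative)
    compound = OpenTox::Compound.from_smiles("CN1C=NC2=C1C(=O)N(C)C(=O)N2C")
    names = compound.to_names_hash        # hash of classification => array of names
    uri   = compound.to_ambit_names_hash  # URI of a dataset holding Ambit name matches
    # both helpers fall back to the string "not available" on service errors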
diff --git a/lib/environment.rb b/lib/environment.rb
index c1b8312..6a72ba5 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
 OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
 OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
 XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
 
diff --git a/lib/model.rb b/lib/model.rb
index a858a0f..b3de1a3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -237,6 +237,7 @@ module OpenTox
 
       @compound = Compound.new compound_uri
       features = {}
+      #LOGGER.debug self.to_yaml
 
       unless @prediction_dataset
         @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +248,22 @@ module OpenTox
           OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
         } )
       end
+
       if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
         all_activities = []
         all_activities = @activities.values.flatten.collect! { |i| i.to_f }
       end
+
       unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
         # Calculation of needed values for query compound
         @compound_features = eval("#{@feature_calculation_algorithm}({
           :compound => @compound,
           :features => @features,
           :feature_dataset_uri => @metadata[OT.featureDataset],
-          :pc_type => self.parameter(\"pc_type\"),
-          :subjectid => subjectid
+          :pc_type => self.parameter(\"pc_type\")
         })")
+
         # Adding fingerprint of query compound with features and values(p_value*nr_hits)
         @compound_fingerprints = {}
         @compound_features.each do |feature, value| # value is nil if "Substructure.match"
diff --git a/lib/parser.rb b/lib/parser.rb
index 56e4fed..e871323 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -349,8 +349,11 @@ module OpenTox
 
     # Load CSV string (format specification: http://toxcreate.org/help)
     # @param [String] csv CSV representation of the dataset
+    # @param [Boolean] drop_missing Whether completely missing rows should be droppped
+    # @param [Boolean] all_numeric Whether all features should be treated as numeric
+    # @param [Boolean] del_nominal All nominal features will be removed
     # @return [OpenTox::Dataset] Dataset object with CSV data
-    def load_csv(csv, drop_missing=false)
+    def load_csv(csv, drop_missing=false, all_numeric=false)
       row = 0
       input = csv.split("\n")
       headers = split_row(input.shift)
@@ -362,7 +365,7 @@ module OpenTox
         row = split_row(row)
         value_maps = detect_new_values(row, value_maps)
         value_maps.each_with_index { |vm,j|
-          if vm.size > @max_class_values # max @max_class_values classes.
+          if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
             regression_features[j]=true
           else
             regression_features[j]=false
@@ -395,17 +398,14 @@ module OpenTox
       info = ''
       @feature_types.each do |feature,types|
         if types.uniq.size == 0
-          type = "helper#MissingFeature"
+          type = "helper#MissingFeature" # TODO: Fit to OT ontology!
         elsif types.uniq.size > 1
           type = OT.NumericFeature
         else
           type = types.first
         end
         @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-        info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
-        # TODO: rewrite feature values
-        # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
+        info += "'#{@dataset.feature_name(feature)}' detected as '#{type.split('#').last}'<br>" if type
       end
 
       @dataset.metadata[OT.Info] = info
@@ -522,7 +522,6 @@ module OpenTox
       def initialize
         @data = {}
         @activity_errors = []
-        @max_class_values = 3
       end
 
       def feature_values(feature)
@@ -654,7 +653,7 @@ module OpenTox
           obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
           table.data[compound.uri] = row
         end
-        
+
         # find and remove ignored_features
         @activity_errors = table.clean_features
         table.add_to_dataset @dataset
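The new all_numeric flag above forces every column to be typed numeric regardless of @max_class_values; pc_descriptors in lib/utils.rb (further down) relies on exactly this. Note that the @param list also documents a del_nominal flag which the new signature does not take yet. A hedged call sketch with invented CSV content:

    # Usage sketch for load_csv after this change (CSV content invented)
    csv = "SMILES,activity\nc1ccccc1,1.0\nCCO,0.5"
    parser = OpenTox::Parser::Spreadsheets.new
    parser.dataset = OpenTox::Dataset.new
    dataset = parser.load_csv(csv, false, true)  # drop_missing=false, all_numeric=true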
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 0d4e82c..7163c46 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -8,18 +8,6 @@ PACKAGE_DIR = package_dir
 
 require "tempfile"
 
-class Array
-
-  def check_uniq
-    hash = {}
-    self.each do |x|
-      raise "duplicate #{x}" if hash[x]
-      hash[x] = true
-    end
-  end
-
-end
-
 module OpenTox
 
   class RUtil
@@ -87,10 +75,12 @@ module OpenTox
     end
 
     # embedds feature values of two datasets into 2D and plots it
+    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
     #
     def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
-        features=nil, subjectid=nil, waiting_task=nil)
+        features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
 
+      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
       LOGGER.debug("r-util> create feature value plot")
       d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
       d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -112,13 +102,17 @@ module OpenTox
       @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
       @r.names = [dataset_name1, dataset_name2]
       LOGGER.debug("r-util> - convert data to 2d")
-      #@r.eval "save.image(\"/tmp/image.R\")"
-      @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
+      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
       waiting_task.progress(75) if waiting_task
 
+      if fast_plot
+        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+      else
+        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+      end
       LOGGER.debug("r-util> - plot data")
       plot_to_files(files) do |file|
-        @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
+        @r.eval "plot_split( df.2d, split, names, #{info})"
       end
     end
 
@@ -176,68 +170,19 @@ module OpenTox
       end
     end
 
-    # stratified splits a dataset into two dataset according to the feature values
-    # all features are taken into account unless <split_features> is given
-    # returns two datases
-    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
-      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
-    end
-
-    # stratified splits a dataset into k datasets according the feature values
+    # stratified splits a dataset into two dataset the feature values
     # all features are taken into account unless <split_features> is given
-    # returns two arrays of datasets
-    def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
-      stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
-    end
-
-    private
-    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
-      raise "internal error" if num_folds!=nil and pct!=nil
-      k_fold_split = num_folds!=nil
-      if k_fold_split
-        raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
-      else
-        raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
-      end
+    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
       raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
-      raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
-      raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
       LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
 
-      df = dataset_to_dataframe( dataset, missing_values, subjectid)
+      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
       @r.eval "set.seed(#{seed})"
-      str_split_features = ""
-      if split_features
-        @r.split_features = split_features if split_features
-        str_split_features = "colnames=split_features"
-      end
-      @r.eval "save.image(\"/tmp/image.R\")"
-
-      if k_fold_split
-        @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
-        split = @r.pull 'split'
-        train = []
-        test = []
-        num_folds.times do |f|
-          datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
-          metadata[DC.title] = "training "+datasetname
-          train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
-          metadata[DC.title] = "test "+datasetname
-          test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
-        end
-        return train, test
-      else
-        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
-        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
-        split = @r.pull 'split'
-        metadata[DC.title] = "Training dataset split of "+dataset.uri
-        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
-        metadata[DC.title] = "Test dataset split of "+dataset.uri
-        test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
-        return train, test
-      end
+      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+      split = @r.pull 'split'
+      split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+      split_to_datasets( df, split, subjectid )
     end
-    public
 
     # dataset should be loaded completely (use Dataset.find)
     # takes duplicates into account
@@ -267,13 +212,9 @@ module OpenTox
         features = dataset.features.keys.sort
       end
       compounds = []
-      compound_names = []
       dataset.compounds.each do |c|
-        count = 0
         num_compounds[c].times do |i|
           compounds << c
-          compound_names << "#{c}$#{count}"
-          count+=1
         end
       end
 
@@ -297,7 +238,7 @@ module OpenTox
         end
       end
       df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
-      assign_dataframe(df_name,d_values,compound_names,features)
+      assign_dataframe(df_name,d_values,compounds,features)
 
       # set dataframe column types accordingly
       f_count = 1 #R starts at 1
@@ -323,18 +264,16 @@ module OpenTox
 
     # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
     # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
-    def dataframe_to_dataset( df, metadata={}, subjectid=nil )
-      dataframe_to_dataset_indices( df, metadata, subjectid, nil)
+    def dataframe_to_dataset( df, subjectid=nil )
+      dataframe_to_dataset_indices( df, subjectid, nil)
     end
 
     private
-    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
+    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
       raise unless @@feats[df].size>0
-      values, compound_names, features = pull_dataframe(df)
-      compounds = compound_names.collect{|c| c.split("$")[0]}
+      values, compounds, features = pull_dataframe(df)
       features.each{|f| raise unless @@feats[df][f]}
       dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
-      dataset.add_metadata(metadata)
       LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
       compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
       features.each{|f| dataset.add_feature(f,@@feats[df][f])}
@@ -351,12 +290,16 @@ module OpenTox
       dataset
     end
 
-    def split_to_dataset( df, split, metadata={}, subjectid=nil )
-      indices = []
-      split.size.times{|i| indices<<i if yield(split[i]) }
-      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
-      LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
-      dataset
+    def split_to_datasets( df, split, subjectid=nil )
+      sets = []
+      (split.min.to_i .. split.max.to_i).each do |i|
+        indices = []
+        split.size.times{|j| indices<<j if split[j]==i}
+        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+        sets << dataset
+      end
+      sets
     end
 
     def pull_dataframe(df)
@@ -380,8 +323,6 @@ module OpenTox
     end
 
     def assign_dataframe(df,input,rownames,colnames)
-      rownames.check_uniq if rownames
-      colnames.check_uniq if colnames
       tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
       file = File.new(tmp, 'w')
       input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
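The rework above collapses the former two-way/k-fold split pair into a single stratified_split; the R mask is inverted so the selected pct partition becomes value 0 and is returned first by split_to_datasets. A hedged call sketch (dataset URI hypothetical):

    # Usage sketch for the simplified split API (URI hypothetical)
    rutil = OpenTox::RUtil.new
    dataset = OpenTox::Dataset.find("http://example.org/dataset/42")
    sets = rutil.stratified_split(dataset, "NA", 0.3)
    # sets[0]: the selected 30% partition, sets[1]: the remaining 70%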
"pca" : "smacof")}')" waiting_task.progress(75) if waiting_task + if fast_plot + info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'" + else + info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'" + end LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')" + @r.eval "plot_split( df.2d, split, names, #{info})" end end @@ -176,68 +170,19 @@ module OpenTox end end - # stratified splits a dataset into two dataset according to the feature values - # all features are taken into account unless <split_features> is given - # returns two datases - def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) - stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) - end - - # stratified splits a dataset into k datasets according the feature values + # stratified splits a dataset into two dataset the feature values # all features are taken into account unless <split_features> is given - # returns two arrays of datasets - def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil ) - stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features ) - end - - private - def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) - raise "internal error" if num_folds!=nil and pct!=nil - k_fold_split = num_folds!=nil - if k_fold_split - raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum) - else - raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) - end + def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 - raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 - raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") - df = dataset_to_dataframe( dataset, missing_values, subjectid) + df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features ) @r.eval "set.seed(#{seed})" - str_split_features = "" - if split_features - @r.split_features = split_features if split_features - str_split_features = "colnames=split_features" - end - @r.eval "save.image(\"/tmp/image.R\")" - - if k_fold_split - @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})" - split = @r.pull 'split' - train = [] - test = [] - num_folds.times do |f| - datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s - metadata[DC.title] = "training "+datasetname - train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) } - metadata[DC.title] = "test "+datasetname - test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) } - end - return train, test - else - puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" - split = @r.pull 'split' - metadata[DC.title] = "Training dataset split of "+dataset.uri - train = 
diff --git a/lib/stratification.R b/lib/stratification.R
index 3f8698c..76ff2d8 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,13 +1,4 @@
 
-round_it <- function( x )
-{
-  if(isTRUE((x - floor(x))>=0.5))
-    ceiling(x)
-  else
-    floor(x)
-}
-
-
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -50,13 +41,9 @@ nominal_to_binary <- function( data )
   result
 }
 
-process_data <- function( data, colnames=NULL )
+process_data <- function( data )
 {
   data.num <- as.data.frame(data)
-  if (!is.null(colnames))
-  {
-    data.num = subset(data.num, select = colnames)
-  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -85,15 +72,14 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
+stratified_split <- function( data, ratio=0.3, method="cluster" )
 {
-  data.processed = as.matrix(process_data( data, colnames ))
-  print(paste("split using #features: ",ncol(data.processed)))
+  data.processed = as.matrix(process_data( data ))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
+    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -115,11 +101,10 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
     stop("unknown method")
 }
 
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data, colnames ))
-  print(paste("split using #features: ",ncol(data.processed)))
+  data.processed = as.matrix(process_data( data ))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -148,7 +133,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+    res = generateCVRuns(cl,ntimes=1,nfold=3)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -159,50 +144,6 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
     stop("unknown method")
 }
 
-duplicate_indices <- function( data ) {
-  indices = 1:nrow(data)
-  z = data
-  duplicate_index = anyDuplicated(z)
-  while(duplicate_index) {
-    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
-    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
-    indices[duplicate_index] <- duplicate_to_index
-    z[duplicate_index,] <- paste('123$§%',duplicate_index)
-    duplicate_index = anyDuplicated(z)
-  }
-  indices
-}
-
-add_duplicates <- function( data, dup_indices ) {
-  result = data[1,]
-  for(i in 2:length(dup_indices)) {
-    row = data[rownames(data)==dup_indices[i],]
-    if(length(row)==0)
-      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
-    result = rbind(result, row)
-  }
-  rownames(result)<-NULL
-  result
-}
-
-sammon_duplicates <- function( data, ... ) {
-  di <- duplicate_indices(data)
-  print(di)
-  u <- unique(data)
-  print(paste('unique data points',nrow(u),'of',nrow(data)))
-  if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
-  points_unique <- sammon(dist(u), ...)$points
-  if (nrow(u)<nrow(data))
-  {
-    points <- add_duplicates(points_unique, di)
-    points
-  }
-  else
-  {
-    points_unique
-  }
-}
-
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -217,11 +158,6 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
-  else if (method == "sammon")
-  {
-    require("MASS")
-    sammon_duplicates(data.processed, k=2)
-  }
   else
     stop("unknown method")
 }
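On the Ruby side, stratified_split from stratification.R is driven through RinRuby as shown in the lib/r-util.rb hunk above; in condensed form (the dataframe name is illustrative):

    # Condensed sketch of the Ruby/R handshake (dataframe name illustrative)
    @r.eval "set.seed(42)"
    @r.eval "split <- stratified_split(df_mydataset, ratio=0.3)"
    split = @r.pull 'split'   # one 0/1 marker per row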
diff --git a/lib/transform.rb b/lib/transform.rb
index f6f769d..cb530a3 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -396,8 +396,8 @@ module OpenTox
         @q_prop = gsl_q_prop_orig.row(0).to_a
       end
 
-      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
-      LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" if (@sims && @acts)
+      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
+      LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
       @sims = [ gram_matrix, @sims ]
 
diff --git a/lib/utils.rb b/lib/utils.rb
index d9d7b4b..40988db 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,4 +1,5 @@
 require 'csv'
+require 'tempfile'
 
 module OpenTox
 
@@ -8,18 +9,60 @@ module OpenTox
     include OpenTox
 
     # Calculate physico-chemical descriptors.
-    # @param[Hash] Required keys: :dataset_uri, :pc_type
+    # @param[Hash] Required keys: :dataset_uri, :pc_type, :rjb
     # @return[String] dataset uri
-
     def self.pc_descriptors(params)
 
       begin
         ds = OpenTox::Dataset.find(params[:dataset_uri])
         compounds = ds.compounds.collect
-        ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
-        #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
-        LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
-        load_ds_csv(ambit_result_uri, smiles_to_inchi)
+
+        jl_master=nil
+        ambit_master=nil
+
+        # joelib via rjb
+        types = params[:pc_type].split(",")
+
+        step= (1.0/types.size * 100).floor
+        if types.size && types.include?("joelib")
+          jl_master = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb] } )
+          types.delete("joelib")
+        end
+        params[:task].progress(step) if params[:task]
+
+
+        # ambit via REST
+        if types.size > 0
+          ambit_result_uri, smiles_to_inchi = get_ambit_descriptors( { :compounds => compounds, :pc_type => types.join(','), :task => params[:task], :step => step } )
+          LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+          ambit_master = load_ds_csv(ambit_result_uri, smiles_to_inchi)
+        end
+
+
+        # Fuse CSVs
+        if jl_master && ambit_master
+          nr_cols = (jl_master[0].size)-1
+          LOGGER.debug "Merging #{nr_cols} new columns"
+          ambit_master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+          jl_master.each do |row|
+            temp = ambit_master.assoc(row[0]) # Finds the appropriate line in master
+            ((-1*nr_cols)..-1).collect.each { |idx|
+              temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+            }
+          end
+          master = ambit_master
+        else
+          master = jl_master if jl_master
+          master = ambit_master if ambit_master
+        end
+
+        parser = OpenTox::Parser::Spreadsheets.new
+        ds = OpenTox::Dataset.new
+        ds.save
+        parser.dataset = ds
+        ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true)
+        ds.save
+
       rescue Exception => e
         LOGGER.debug "#{e.class}: #{e.message}"
         LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
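The fusion step above widens every Ambit row by the number of JOELib value columns, then matches rows by the key in column 0 and copies the values across; because assoc also matches the header rows, the header cells merge the same way. A self-contained toy illustration of the idiom (data invented):

    # Self-contained sketch of the row-fusion idiom (toy data)
    ambit_master = [["Compound", "TPSA"], ["InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3", "20.2"]]
    jl_master    = [["Compound", "KierShape1"], ["InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3", "1.23"]]
    nr_cols = jl_master[0].size - 1
    ambit_master.each { |row| nr_cols.times { row.push(nil) } }  # widen every row
    jl_master.each do |row|
      temp = ambit_master.assoc(row[0])                          # match on column 0
      ((-1 * nr_cols)..-1).each { |idx| temp[idx] = row[nr_cols + idx + 1] if temp }
    end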
,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing - LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" - load_ds_csv(ambit_result_uri, smiles_to_inchi) + + jl_master=nil + ambit_master=nil + + # joelib via rjb + types = params[:pc_type].split(",") + + step= (1.0/types.size * 100).floor + if types.size && types.include?("joelib") + jl_master = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb] } ) + types.delete("joelib") + end + params[:task].progress(step) if params[:task] + + + # ambit via REST + if types.size > 0 + ambit_result_uri, smiles_to_inchi = get_ambit_descriptors( { :compounds => compounds, :pc_type => types.join(','), :task => params[:task], :step => step } ) + LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" + ambit_master = load_ds_csv(ambit_result_uri, smiles_to_inchi) + end + + + # Fuse CSVs + if jl_master && ambit_master + nr_cols = (jl_master[0].size)-1 + LOGGER.debug "Merging #{nr_cols} new columns" + ambit_master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows + jl_master.each do |row| + temp = ambit_master.assoc(row[0]) # Finds the appropriate line in master + ((-1*nr_cols)..-1).collect.each { |idx| + temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found + } + end + master = ambit_master + else + master = jl_master if jl_master + master = ambit_master if ambit_master + end + + parser = OpenTox::Parser::Spreadsheets.new + ds = OpenTox::Dataset.new + ds.save + parser.dataset = ds + ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true) + ds.save + rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" @@ -27,10 +70,94 @@ module OpenTox end - # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit. + + # Calculates PC descriptors via JOELib2. + # @param[Hash] Required keys: :compounds, :rjb + # @return[String] dataset uri + def self.get_jl_descriptors(params) + + s = params[:rjb] + master = nil + raise "No Java environment" unless s + + # Load keys, enter CSV headers + begin + keysfile = File.join(ENV['HOME'], ".opentox", "config", "jl_keys.yaml") + csvfile = Tempfile.open(['jl_descriptors-csv-','.sdf']) + jl_keys = YAML::load_file(keysfile) + jl_colnames = jl_keys.collect{ |k| + k.split(".").last + } + csvfile.puts((["SMILES"] + jl_colnames).join(",")) + + # remember inchis + inchis = params[:compounds].collect { |c_uri| + cmpd = OpenTox::Compound.new(c_uri) + URI.encode_www_form_component(cmpd.to_inchi) + } + + # Process compounds + params[:compounds].each_with_index { |c_uri, c_idx| + cmpd = OpenTox::Compound.new(c_uri) + inchi = cmpd.to_inchi + sdf_data = cmpd.to_sdf + + infile = Tempfile.open(['jl_descriptors-in-','.sdf']) + outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out") + + begin + infile.puts sdf_data + infile.flush + s.new(infile.path, outfile_path) + + row = [inchis[c_idx]] + jl_keys.each_with_index do |k,i| # Fill row + re = Regexp.new(k) + open(outfile_path) do |f| + f.each do |line| + if @prev =~ re + entry = line.chomp + val = nil + if OpenTox::Algorithm.numeric?(entry) + val = Float(entry) + val = nil if val.nan? + val = nil if val.infinite? 
@@ -38,16 +165,17 @@ module OpenTox
         ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
         descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
         descs_uris = []
-        params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
         types = params[:pc_type].split(",")
         descs.each { |uri, cat_name|
           if types.include? cat_name[:category]
-            descs_uris << uri
+            descs_uris << "#{cat_name[:category]}:::#{uri}"
           end
         }
         if descs_uris.size == 0
           raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
         end
+        descs_uris.sort!
+        descs_uris.collect! { |uri| uri.split(":::").last }
         #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
         begin
@@ -75,17 +203,21 @@ module OpenTox
         end
         ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
 
-        # Calculate 3D for CPSA
-        if types.include? "cpsa"
-          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
-          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
-        end
+        # -C-a-l-c-u-l-a-t-e- -3-D- -f-o-r- -C-P-S-A-
+        # Always calculate 3D! See http://goo.gl/Tk81j
+        #if types.include? "cpsa"
+          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
+          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+        #end
 
         # Get Ambit results
         ambit_result_uri = [] # 1st pos: base uri, then features
         ambit_result_uri << ambit_ds_uri + "?"
         ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+        current_cat = ""
         descs_uris.each_with_index do |uri, i|
+          old_cat = current_cat; current_cat = descs[uri][:category]
+          params[:task].progress(params[:task].metadata[OT.percentageCompleted] + params[:step]) if params[:task] && params[:step] && old_cat != current_cat && old_cat != ""
           algorithm = Algorithm::Generic.new(uri)
           result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
           ambit_result_uri << result_uri.split("?")[1] + "&"
ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&") + current_cat = "" descs_uris.each_with_index do |uri, i| + old_cat = current_cat; current_cat = descs[uri][:category] + params[:task].progress(params[:task].metadata[OT.percentageCompleted] + params[:step]) if params[:task] && params[:step] && old_cat != current_cat && old_cat != "" algorithm = Algorithm::Generic.new(uri) result_uri = algorithm.run({:dataset_uri => ambit_ds_uri}) ambit_result_uri << result_uri.split("?")[1] + "&" @@ -104,13 +236,13 @@ module OpenTox # Load dataset via CSV # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features # @return[String] dataset uri - def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil) + def self.load_ds_csv(ambit_result_uri, smiles_to_inchi) master=nil (1...ambit_result_uri.size).collect { |idx| curr_uri = ambit_result_uri[0] + ambit_result_uri[idx] LOGGER.debug "Requesting #{curr_uri}" - csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) ) + csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) ) if csv_data[0] && csv_data[0].size>1 if master.nil? # This is the smiles entry (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] } @@ -139,17 +271,12 @@ module OpenTox master[0][0] = "Compound" #"SMILES" index_smi = master[0].index("SMILES") master.map {|i| i.delete_at(index_smi)} if index_smi - #master[0][0] = "SMILES" + master[0][0] = "SMILES" #LOGGER.debug "-------- AM: Writing to dumpfile" #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) } - parser = OpenTox::Parser::Spreadsheets.new - ds = OpenTox::Dataset.new(nil,subjectid) - ds.save(subjectid) - parser.dataset = ds - ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n")) - ds.save(subjectid) + master end |