author     David Vorgrimmler <vorgrimmlerdavid@gmx.de>   2012-06-05 16:03:30 +0200
committer  David Vorgrimmler <vorgrimmlerdavid@gmx.de>   2012-06-05 16:03:30 +0200
commit     e2d670cb86d7b78f69d7f5d24a8cae5d79505e0f (patch)
tree       2163a3e46afd2938e0a9e48aba6e00e26764f183
parent     9cc85cd21d4c400b2a5c20068efaac5cde56a476 (diff)
parent     bc41a12a20612a17142c51626c708ceb1f764db6 (diff)
Merge branch 'bbrc-sample' into development
-rw-r--r--  lib/algorithm.rb   113
-rw-r--r--  lib/serializer.rb   48
-rw-r--r--  lib/utils.rb         4
3 files changed, 117 insertions, 48 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index b921b9c..8b6fca5 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -68,19 +68,61 @@ module OpenTox
raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
unless params[:min_frequency].nil?
- @minfreq=params[:min_frequency].to_i
- raise "Minimum frequency must be a number >0!" unless @minfreq>0
- else
- @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ # check for percentage
+ if params[:min_frequency].include? "pc"
+ per_mil=params[:min_frequency].gsub(/pc/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i * 10
+ else
+ bad_request=true
+ end
+ # check for per-mil
+ elsif params[:min_frequency].include? "pm"
+ per_mil=params[:min_frequency].gsub(/pm/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i
+ else
+ bad_request=true
+ end
+ # set minfreq directly
+ else
+ if OpenTox::Algorithm.numeric? params[:min_frequency]
+ @minfreq=params[:min_frequency].to_i
+ LOGGER.debug "min_frequency #{@minfreq}"
+ else
+ bad_request=true
+ end
+ end
+ raise OpenTox::BadRequestError.new "Minimum frequency must be an integer [n], a percentage [n]pc, or a per-mil value [n]pm, with n greater than 0" if bad_request
+ if @minfreq.nil?
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+ LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
+ end
end
end
def add_fminer_data(fminer_instance, value_map)
+
+ # detect the number of duplicate entries per compound
+ compound_sizes = {}
+ @training_dataset.compounds.each do |compound|
+ entries=@training_dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end
+
id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry| #order of compounds does not influence result
+
+ @training_dataset.compounds.each do |compound|
+ entry=@training_dataset.data_entries[compound]
begin
- smiles = OpenTox::Compound.smiles(compound.to_s)
+ smiles = OpenTox::Compound.new(compound).to_smiles
rescue
LOGGER.warn "No resource for #{compound.to_s}"
next
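Note: the new min_frequency handling accepts a plain integer, a percentage suffixed with "pc", or a per-mil value suffixed with "pm"; only in the latter two cases is the per-mil value later converted to an absolute count via OpenTox::Algorithm.min_frequency. A minimal standalone sketch of the same parsing logic (the method name is illustrative, not part of the library):

    # Illustrative sketch of the min_frequency parsing introduced above:
    # plain integers are absolute counts, "pc" marks a percentage,
    # "pm" a per-mil value.
    def parse_min_frequency(raw)
      case raw.to_s
      when /\A(\d+)pc\z/ then { :per_mil => $1.to_i * 10 }  # e.g. "5pc" -> 50 per-mil
      when /\A(\d+)pm\z/ then { :per_mil => $1.to_i }       # e.g. "8pm" ->  8 per-mil
      when /\A\d+\z/     then { :minfreq => raw.to_i }      # e.g. "10"  -> absolute count
      else
        raise ArgumentError, "Minimum frequency must be an integer [n], a percentage [n]pc, or a per-mil value [n]pm, with n greater than 0"
      end
    end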
@@ -92,29 +134,29 @@ module OpenTox
entry.each do |feature,values|
if feature == @prediction_feature.uri
- values.each do |value|
- if value.nil?
+ (0...compound_sizes[compound]).each { |i|
+ if values[i].nil?
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
+ activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
- activity= value.to_f
+ activity= values[i].to_f
end
begin
- fminer_instance.AddCompound(smiles,id)
- fminer_instance.AddActivity(activity, id)
+ fminer_instance.AddCompound(smiles,id) if fminer_instance
+ fminer_instance.AddActivity(activity, id) if fminer_instance
@all_activities[id]=activity # DV: insert global information
@compounds[id] = compound
@smi[id] = smiles
id += 1
rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
LOGGER.warn e.backtrace
end
end
- end
+ }
end
end
end
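Note: add_fminer_data now feeds every replicate measurement of a compound into fminer separately; the guard above requires that all features of a compound carry the same number of values. A reduced sketch of that consistency check (the helper name is illustrative):

    # Illustrative consistency check: every feature of a compound must
    # report the same number of replicate values; return that count.
    def replicate_count(data_entries, compound)
      sizes = data_entries[compound].values.collect { |values| values.size }.uniq
      raise "Inappropriate data for fminer" if sizes.size > 1
      sizes.first
    end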
@@ -462,7 +504,7 @@ module OpenTox
@r.del_missing = params[:del_missing] == true ? 1 : 0
r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
@r.f_fds_r = r_result_file.to_s
-
+
# need packs 'randomForest', 'RANN'
@r.eval <<-EOR
suppressPackageStartupMessages(library('caret'))
@@ -471,17 +513,17 @@ module OpenTox
suppressPackageStartupMessages(library('doMC'))
registerDoMC()
set.seed(1)
-
+
acts = read.csv(ds_csv_file, check.names=F)
feats = read.csv(fds_csv_file, check.names=F)
ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
y = ds[,which(names(ds) == prediction_feature)]
-
+
# assumes a data matrix 'features' and a vector 'y' of target values
row.names(features)=NULL
-
+
# features with all values missing removed
na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
features = features[,!names(features) %in% na_col]
@@ -493,7 +535,7 @@ module OpenTox
# features with zero variance removed
zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
features = features[,!names(features) %in% zero_var]
-
+
pp = NULL
if (del_missing) {
# needed if rows should be removed
@@ -506,11 +548,11 @@ module OpenTox
pp = preProcess(features, method=c("scale", "center", "knnImpute"))
}
features = predict(pp, features)
-
+
# features with nan values removed (sometimes preProcess return NaN values)
nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
features = features[,!names(features) %in% nan_col]
-
+
# determine subsets
subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
#subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
@@ -519,10 +561,10 @@ module OpenTox
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
-
+
# Recursive feature elimination
rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
-
+
# read existing dataset and select most useful features
csv=feats[,c("SMILES", rfProfile$optVariables)]
write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
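Note: the R heredoc above is evaluated through the library's R bridge (@r) after the CSV paths and options have been assigned as R variables. A hedged sketch of the same wiring using the plain rinruby gem and made-up file names:

    # Hedged sketch of driving the R feature-elimination script from Ruby;
    # the production code uses the library's own R wrapper, this uses rinruby.
    require 'rinruby'

    r = RinRuby.new(false)                      # no echo of R output
    r.assign("ds_csv_file",  "activities.csv")  # illustrative paths
    r.assign("fds_csv_file", "features.csv")
    r.assign("prediction_feature", "activity")
    r.assign("del_missing", 0)
    r.assign("f_fds_r", "features_rfe.csv")
    r.eval "suppressPackageStartupMessages(library('caret'))"
    # ... remainder of the R code from the heredoc above ...
    r.quit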
@@ -563,3 +605,26 @@ module OpenTox
end
end
end
+
+class Array
+ # collect method extended for parallel processing.
+ # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+ # @param n the number of processes to spawn (default: unlimited)
+ def pcollect(n = nil)
+ nproc = 0
+ result = collect do |*a|
+ r, w = IO.pipe
+ fork do
+ r.close
+ w.write( Marshal.dump( yield(*a) ) )
+ end
+ if n and (nproc+=1) >= n
+ Process.wait ; nproc -= 1
+ end
+ [ w.close, r ].last
+ end
+ Process.waitall
+ result.collect{|r| Marshal.load [ r.read, r.close ].first}
+ end
+end
+
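Note: Array#pcollect forks one child process per element (optionally capped at n concurrent children) and marshals each block result back to the parent through a pipe; results keep the order of the input array. A small usage sketch with arbitrary data:

    # Usage sketch for the Array#pcollect extension above (POSIX only,
    # it relies on Kernel#fork). At most two children run concurrently.
    smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]           # illustrative input
    lengths = smiles_list.pcollect(2) { |smi| smi.length }
    puts lengths.inspect                                   # => [3, 8, 7]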
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 4c26329..a1b980d 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -483,13 +483,15 @@ module OpenTox
compound_sizes = {}
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
- entries.each do |feature, values|
- compound_sizes[compound] || compound_sizes[compound] = []
- compound_sizes[compound] << values.size
+ if entries
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
end
- compound_sizes[compound].uniq!
- raise "Inappropriate data for CSV export" if compound_sizes[compound].size > 1
- compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
end
# substructures: complete data entries with zeroes of appropriate duplicates
@@ -501,23 +503,25 @@ module OpenTox
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
- cmpd = Compound.new(compound)
- inchi = URI.encode_www_form_component(cmpd.to_inchi)
-
- # allocate container
- row_container = Array.new(compound_sizes[compound])
- (0...row_container.size).each do |i|
- row_container[i] = Array.new(@rows.first.size)
- end
-
- entries.each { |feature, values|
- (0...compound_sizes[compound]).each { |i|
- j = features.index(feature)+1
- row_container[i][0] = inchi
- row_container[i][j] = values[i]
+ if entries
+ cmpd = Compound.new(compound)
+ inchi = URI.encode_www_form_component(cmpd.to_inchi)
+
+ # allocate container
+ row_container = Array.new(compound_sizes[compound])
+ (0...row_container.size).each do |i|
+ row_container[i] = Array.new(@rows.first.size)
+ end
+
+ entries.each { |feature, values|
+ (0...compound_sizes[compound]).each { |i|
+ j = features.index(feature)+1
+ row_container[i][0] = inchi
+ row_container[i][j] = values[i]
+ }
}
- }
- row_container.each { |r| @rows << r }
+ row_container.each { |r| @rows << r }
+ end
end
end
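Note: the serializer now emits one CSV row per replicate measurement of a compound, each row starting with the InChI and carrying the i-th value of every feature. A compact sketch of how one compound with two replicates expands into two rows (names and values are illustrative):

    # Illustrative expansion of one compound with two replicate measurements
    # into two CSV rows, following the row_container logic above.
    features = ["feat_a", "feat_b"]
    entries  = { "feat_a" => [1.0, 2.0], "feat_b" => [0.0, 1.0] }
    inchi    = "InChI=..."                                   # placeholder

    replicates = entries.values.first.size
    rows = Array.new(replicates) { Array.new(features.size + 1) }
    entries.each do |feature, values|
      (0...replicates).each do |i|
        rows[i][0] = inchi
        rows[i][features.index(feature) + 1] = values[i]
      end
    end
    # rows => [["InChI=...", 1.0, 0.0], ["InChI=...", 2.0, 1.0]]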
diff --git a/lib/utils.rb b/lib/utils.rb
index f6f8a4a..e04199d 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -467,8 +467,8 @@ module OpenTox
end
- # Effect calculation for classification
- # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise
+ # @param [Array] Array of occurrences per class (in the form of Enumerables).
# @param [Array] Array of database instance counts per class.
def self.effect(occurrences, db_instances)
max=0
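Note: the clarified documentation states that the two arrays are aligned by class index. A hedged call sketch (the counts are invented, OpenTox::Algorithm is assumed as the enclosing module like the other helpers in utils.rb, and the return semantics are not shown in this hunk):

    # Hedged call sketch: both arrays are indexed by class (two classes here).
    occurrences  = [12, 3]    # substructure occurrences in class 1 / class 2
    db_instances = [40, 60]   # dataset instances of class 1 / class 2
    effect = OpenTox::Algorithm.effect(occurrences, db_instances)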