1 files changed, 143 insertions, 37 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index c026c56..78fc447 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -56,25 +56,73 @@ module OpenTox
 
       def check_params(params,per_mil,subjectid=nil)
         raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and  !params[:dataset_uri].nil?
-        raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and  !params[:prediction_feature].nil?
-        @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
         @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+        unless params[:prediction_feature] # try to read prediction_feature from dataset
+          raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+          prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+          params[:prediction_feature] = prediction_feature.uri
+        end
+        @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
         raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
 
         unless params[:min_frequency].nil? 
-          @minfreq=params[:min_frequency].to_i
-          raise "Minimum frequency must be a number >0!" unless @minfreq>0
-        else
-          @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+          # check for percentage
+          if params[:min_frequency].include? "pc"
+            per_mil=params[:min_frequency].gsub(/pc/,"")
+            if OpenTox::Algorithm.numeric? per_mil
+              per_mil = per_mil.to_i * 10
+            else
+              bad_request=true
+            end
+          # check for per-mil
+          elsif params[:min_frequency].include? "pm"
+            per_mil=params[:min_frequency].gsub(/pm/,"")
+            if OpenTox::Algorithm.numeric? per_mil
+              per_mil = per_mil.to_i
+            else
+              bad_request=true
+            end
+          # set minfreq directly
+          else
+            if OpenTox::Algorithm.numeric? params[:min_frequency]
+              @minfreq=params[:min_frequency].to_i
+              LOGGER.debug "min_frequency #{@minfreq}"
+            else
+              bad_request=true
+            end
+          end
+          raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request
+        end
+        if @minfreq.nil?
+          @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+          LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
         end
       end
 
-      def add_fminer_data(fminer_instance, params, value_map)
+      def add_fminer_data(fminer_instance, value_map)
+
+
+        # detect nr duplicates per compound
+        compound_sizes = {}
+        @training_dataset.compounds.each do |compound|
+          entries=@training_dataset.data_entries[compound]
+          entries.each do |feature, values|
+            compound_sizes[compound] || compound_sizes[compound] = []
+            compound_sizes[compound] << values.size unless values.size == 0
+          end
+          compound_sizes[compound].uniq!
+          raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+          compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+        end
 
         id = 1 # fminer start id is not 0
-        @training_dataset.data_entries.each do |compound,entry|
+
+        @training_dataset.compounds.each do |compound|
+          entry=@training_dataset.data_entries[compound]
           begin
-            smiles = OpenTox::Compound.smiles(compound.to_s)
+            smiles = OpenTox::Compound.new(compound).to_smiles
           rescue
             LOGGER.warn "No resource for #{compound.to_s}"
             next
@@ -84,32 +132,31 @@ module OpenTox
             next
           end
 
-          value_map=params[:value_map] unless params[:value_map].nil?
           entry.each do |feature,values|
             if feature == @prediction_feature.uri
-              values.each do |value|
-                if value.nil? 
+              (0...compound_sizes[compound]).each { |i|
+                if values[i].nil? 
                   LOGGER.warn "No #{feature} activity for #{compound.to_s}."
                 else
                   if @prediction_feature.feature_type == "classification"
-                    activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
+                    activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
                     @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
                   elsif @prediction_feature.feature_type == "regression"
-                    activity= value.to_f 
+                    activity= values[i].to_f 
                   end
                   begin
-                    fminer_instance.AddCompound(smiles,id)
-                    fminer_instance.AddActivity(activity, id)
+                    fminer_instance.AddCompound(smiles,id) if fminer_instance
+                    fminer_instance.AddActivity(activity, id) if fminer_instance 
                     @all_activities[id]=activity # DV: insert global information
                     @compounds[id] = compound
                     @smi[id] = smiles
                     id += 1
                   rescue Exception => e
-                    LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+                    LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
                     LOGGER.warn e.backtrace
                   end
                 end
-              end
+              }
             end
           end
         end
@@ -380,11 +427,11 @@ module OpenTox
           prediction = acts[0]
         else
           #LOGGER.debug gram_matrix.to_yaml
-          @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
-          @r.eval "set.seed(1)"
+          @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
           @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
           @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
           @r.eval "registerDoMC()" # switch on parallel processing
+          @r.eval "set.seed(1)"
           begin
 
             # set data
@@ -400,7 +447,14 @@ module OpenTox
 
             # prepare data
             LOGGER.debug "Preparing R data ..."
-            @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+            @r.eval <<-EOR
+              weights=NULL
+              if (class(y) == 'character') { 
+                y = factor(y)
+                suppressPackageStartupMessages(library('class')) 
+                #weights=unlist(as.list(prop.table(table(y))))
+              }
+            EOR
 
             @r.eval <<-EOR
               rem = nearZeroVar(prop_matrix)
@@ -417,8 +471,18 @@ module OpenTox
 
             # model + support vectors
             LOGGER.debug "Creating R SVM model ..."
-            @r.eval <<-EOR
-              model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+            train_success = @r.eval <<-EOR
+              # AM: TODO: evaluate class weight effect by altering:
+              # AM: comment in 'weights' above run and class.weights=weights vs. class.weights=1-weights
+              # AM: vs
+              # AM: comment out 'weights' above (status quo), thereby disabling weights
+              model = train(prop_matrix,y,
+                             method="svmradial",
+                             preProcess=c("center", "scale"),
+                             class.weights=weights,
+                             trControl=trainControl(method="LGOCV",number=10),
+                             tuneLength=8
+                           )
               perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
             EOR
 
@@ -431,6 +495,7 @@ module OpenTox
 
             # censoring
             prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+            prediction = nil unless train_success
             LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
           rescue Exception => e
             LOGGER.debug "#{e.class}: #{e.message}"
@@ -456,30 +521,42 @@ module OpenTox
         @r.del_missing = params[:del_missing] == true ? 1 : 0
         r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
         @r.f_fds_r = r_result_file.to_s
-        
+
         # need packs 'randomForest', 'RANN'
         @r.eval <<-EOR
-          set.seed(1)
           suppressPackageStartupMessages(library('caret'))
           suppressPackageStartupMessages(library('randomForest'))
           suppressPackageStartupMessages(library('RANN'))
           suppressPackageStartupMessages(library('doMC'))
           registerDoMC()
-          
+          set.seed(1)
+
           acts = read.csv(ds_csv_file, check.names=F)
           feats = read.csv(fds_csv_file, check.names=F)
           ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-          
+
           features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
           y = ds[,which(names(ds) == prediction_feature)] 
-          
+
           # assumes a data matrix 'features' and a vector 'y' of target values
           row.names(features)=NULL
-          
+
+          # features with all values missing removed
+          na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+          features = features[,!names(features) %in% na_col]
+
+          # features with infinite values removed
+          inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+          features = features[,!names(features) %in% inf_col]
+
+          # features with zero variance removed
+          zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+          features = features[,!names(features) %in% zero_var]
+
           pp = NULL
           if (del_missing) {
             # needed if rows should be removed
-            na_ids = apply(features,1,function(x)any(is.na(x)))
+            na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
             features = features[!na_ids,]
             y = y[!na_ids]
             pp = preProcess(features, method=c("scale", "center"))
@@ -488,17 +565,23 @@ module OpenTox
             pp = preProcess(features, method=c("scale", "center", "knnImpute"))
           }
           features = predict(pp, features)
-          
+
+          # features with nan values removed (sometimes preProcess return NaN values)
+          nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+          features = features[,!names(features) %in% nan_col]
+
           # determine subsets
-          subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
-          subsets = c(2,3,4,5,7,10,subsets)
+          subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+          #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+          #subsets = c(2,3,4,5,7,10,subsets)
+          #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
           subsets = unique(sort(round(subsets))) 
           subsets = subsets[subsets<=dim(features)[2]]
           subsets = subsets[subsets>1] 
-          
+
           # Recursive feature elimination
-          rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
-          
+          rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+
           # read existing dataset and select most useful features
           csv=feats[,c("SMILES", rfProfile$optVariables)]
           write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
@@ -527,7 +610,7 @@ module OpenTox
       # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
       # @return [Hash] Hash with matching Smarts and number of hits 
       def self.lookup(params)
-        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+        params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid])
       end  
     end
 
@@ -539,3 +622,26 @@ module OpenTox
     end
   end
 end
+
+class Array
+  # collect method extended for parallel processing.
+  # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+  # @param n the number of processes to spawn (default: unlimited)
+  def pcollect(n = nil)
+    nproc = 0
+    result = collect do |*a|
+      r, w = IO.pipe
+      fork do
+        r.close
+        w.write( Marshal.dump( yield(*a) ) )
+      end
+      if n and (nproc+=1) >= n
+        Process.wait ; nproc -= 1
+      end
+      [ w.close, r ].last
+    end
+    Process.waitall
+    result.collect{|r| Marshal.load [ r.read, r.close ].first}
+  end
+end
+