diff options
Diffstat (limited to 'lib/algorithm.rb')
-rw-r--r-- | lib/algorithm.rb | 180 |
1 files changed, 143 insertions, 37 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index c026c56..78fc447 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -56,25 +56,73 @@ module OpenTox def check_params(params,per_mil,subjectid=nil) raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? - raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? - @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid + + unless params[:prediction_feature] # try to read prediction_feature from dataset + raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1 + prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid) + params[:prediction_feature] = prediction_feature.uri + end + @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid + raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature]) unless params[:min_frequency].nil? - @minfreq=params[:min_frequency].to_i - raise "Minimum frequency must be a number >0!" unless @minfreq>0 - else - @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + # check for percentage + if params[:min_frequency].include? "pc" + per_mil=params[:min_frequency].gsub(/pc/,"") + if OpenTox::Algorithm.numeric? per_mil + per_mil = per_mil.to_i * 10 + else + bad_request=true + end + # check for per-mil + elsif params[:min_frequency].include? "pm" + per_mil=params[:min_frequency].gsub(/pm/,"") + if OpenTox::Algorithm.numeric? per_mil + per_mil = per_mil.to_i + else + bad_request=true + end + # set minfreq directly + else + if OpenTox::Algorithm.numeric? params[:min_frequency] + @minfreq=params[:min_frequency].to_i + LOGGER.debug "min_frequency #{@minfreq}" + else + bad_request=true + end + end + raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request + end + if @minfreq.nil? + @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) + LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)" end end - def add_fminer_data(fminer_instance, params, value_map) + def add_fminer_data(fminer_instance, value_map) + + + # detect nr duplicates per compound + compound_sizes = {} + @training_dataset.compounds.each do |compound| + entries=@training_dataset.data_entries[compound] + entries.each do |feature, values| + compound_sizes[compound] || compound_sizes[compound] = [] + compound_sizes[compound] << values.size unless values.size == 0 + end + compound_sizes[compound].uniq! + raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1 + compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array + end id = 1 # fminer start id is not 0 - @training_dataset.data_entries.each do |compound,entry| + + @training_dataset.compounds.each do |compound| + entry=@training_dataset.data_entries[compound] begin - smiles = OpenTox::Compound.smiles(compound.to_s) + smiles = OpenTox::Compound.new(compound).to_smiles rescue LOGGER.warn "No resource for #{compound.to_s}" next @@ -84,32 +132,31 @@ module OpenTox next end - value_map=params[:value_map] unless params[:value_map].nil? entry.each do |feature,values| if feature == @prediction_feature.uri - values.each do |value| - if value.nil? + (0...compound_sizes[compound]).each { |i| + if values[i].nil? LOGGER.warn "No #{feature} activity for #{compound.to_s}." else if @prediction_feature.feature_type == "classification" - activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n + activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect elsif @prediction_feature.feature_type == "regression" - activity= value.to_f + activity= values[i].to_f end begin - fminer_instance.AddCompound(smiles,id) - fminer_instance.AddActivity(activity, id) + fminer_instance.AddCompound(smiles,id) if fminer_instance + fminer_instance.AddActivity(activity, id) if fminer_instance @all_activities[id]=activity # DV: insert global information @compounds[id] = compound @smi[id] = smiles id += 1 rescue Exception => e - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" + LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer" LOGGER.warn e.backtrace end end - end + } end end end @@ -380,11 +427,11 @@ module OpenTox prediction = acts[0] else #LOGGER.debug gram_matrix.to_yaml - @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "set.seed(1)" + @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab" @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore" @r.eval "registerDoMC()" # switch on parallel processing + @r.eval "set.seed(1)" begin # set data @@ -400,7 +447,14 @@ module OpenTox # prepare data LOGGER.debug "Preparing R data ..." - @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification + @r.eval <<-EOR + weights=NULL + if (class(y) == 'character') { + y = factor(y) + suppressPackageStartupMessages(library('class')) + #weights=unlist(as.list(prop.table(table(y)))) + } + EOR @r.eval <<-EOR rem = nearZeroVar(prop_matrix) @@ -417,8 +471,18 @@ module OpenTox # model + support vectors LOGGER.debug "Creating R SVM model ..." - @r.eval <<-EOR - model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale")) + train_success = @r.eval <<-EOR + # AM: TODO: evaluate class weight effect by altering: + # AM: comment in 'weights' above run and class.weights=weights vs. class.weights=1-weights + # AM: vs + # AM: comment out 'weights' above (status quo), thereby disabling weights + model = train(prop_matrix,y, + method="svmradial", + preProcess=c("center", "scale"), + class.weights=weights, + trControl=trainControl(method="LGOCV",number=10), + tuneLength=8 + ) perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) EOR @@ -431,6 +495,7 @@ module OpenTox # censoring prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance ) + prediction = nil unless train_success LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}" rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" @@ -456,30 +521,42 @@ module OpenTox @r.del_missing = params[:del_missing] == true ? 1 : 0 r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_") @r.f_fds_r = r_result_file.to_s - + # need packs 'randomForest', 'RANN' @r.eval <<-EOR - set.seed(1) suppressPackageStartupMessages(library('caret')) suppressPackageStartupMessages(library('randomForest')) suppressPackageStartupMessages(library('RANN')) suppressPackageStartupMessages(library('doMC')) registerDoMC() - + set.seed(1) + acts = read.csv(ds_csv_file, check.names=F) feats = read.csv(fds_csv_file, check.names=F) ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-) - + features = ds[,(dim(acts)[2]+1):(dim(ds)[2])] y = ds[,which(names(ds) == prediction_feature)] - + # assumes a data matrix 'features' and a vector 'y' of target values row.names(features)=NULL - + + # features with all values missing removed + na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) ) + features = features[,!names(features) %in% na_col] + + # features with infinite values removed + inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) ) + features = features[,!names(features) %in% inf_col] + + # features with zero variance removed + zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) ) + features = features[,!names(features) %in% zero_var] + pp = NULL if (del_missing) { # needed if rows should be removed - na_ids = apply(features,1,function(x)any(is.na(x))) + na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) ) features = features[!na_ids,] y = y[!na_ids] pp = preProcess(features, method=c("scale", "center")) @@ -488,17 +565,23 @@ module OpenTox pp = preProcess(features, method=c("scale", "center", "knnImpute")) } features = predict(pp, features) - + + # features with nan values removed (sometimes preProcess return NaN values) + nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) ) + features = features[,!names(features) %in% nan_col] + # determine subsets - subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) - subsets = c(2,3,4,5,7,10,subsets) + subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7) + #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) + #subsets = c(2,3,4,5,7,10,subsets) + #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30) subsets = unique(sort(round(subsets))) subsets = subsets[subsets<=dim(features)[2]] subsets = subsets[subsets>1] - + # Recursive feature elimination - rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets) - + rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets) + # read existing dataset and select most useful features csv=feats[,c("SMILES", rfProfile$optVariables)] write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='') @@ -527,7 +610,7 @@ module OpenTox # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type # @return [Hash] Hash with matching Smarts and number of hits def self.lookup(params) - params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid]) + params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid]) end end @@ -539,3 +622,26 @@ module OpenTox end end end + +class Array + # collect method extended for parallel processing. + # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... } + # @param n the number of processes to spawn (default: unlimited) + def pcollect(n = nil) + nproc = 0 + result = collect do |*a| + r, w = IO.pipe + fork do + r.close + w.write( Marshal.dump( yield(*a) ) ) + end + if n and (nproc+=1) >= n + Process.wait ; nproc -= 1 + end + [ w.close, r ].last + end + Process.waitall + result.collect{|r| Marshal.load [ r.read, r.close ].first} + end +end + |