From 1b02cef5f5af5930a2c0a449357618c9266c29ed Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Thu, 3 May 2012 16:35:18 +0200 Subject: Ordered addition of entries to fminer --- lib/algorithm.rb | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/lib/algorithm.rb b/lib/algorithm.rb index b921b9c..64fa508 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -75,10 +75,26 @@ module OpenTox end end - def add_fminer_data(fminer_instance, value_map) + def add_fminer_data(fminer_instance, value_map, prepare_backend=true) + + + # detect nr duplicates per compound + compound_sizes = {} + @training_dataset.compounds.each do |compound| + entries=@training_dataset.data_entries[compound] + entries.each do |feature, values| + compound_sizes[compound] || compound_sizes[compound] = [] + compound_sizes[compound] << values.size + end + compound_sizes[compound].uniq! + raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1 + compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array + end id = 1 # fminer start id is not 0 - @training_dataset.data_entries.each do |compound,entry| #order of compounds does not influence result + + @training_dataset.compounds.each do |compound| + entry=@training_dataset.data_entries[compound] begin smiles = OpenTox::Compound.smiles(compound.to_s) rescue @@ -92,29 +108,29 @@ module OpenTox entry.each do |feature,values| if feature == @prediction_feature.uri - values.each do |value| - if value.nil? + (0...compound_sizes[compound]).each { |i| + if values[i].nil? LOGGER.warn "No #{feature} activity for #{compound.to_s}." else if @prediction_feature.feature_type == "classification" - activity= value_map.invert[value].to_i # activities are mapped to 1..n + activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect elsif @prediction_feature.feature_type == "regression" - activity= value.to_f + activity= values[i].to_f end begin - fminer_instance.AddCompound(smiles,id) - fminer_instance.AddActivity(activity, id) + fminer_instance.AddCompound(smiles,id) if prepare_backend + fminer_instance.AddActivity(activity, id) if prepare_backend @all_activities[id]=activity # DV: insert global information @compounds[id] = compound @smi[id] = smiles id += 1 rescue Exception => e - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" + LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer" LOGGER.warn e.backtrace end end - end + } end end end -- cgit v1.2.3 From 3a790b762fafbfe1a3b92aa494355bc8ab6ca978 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Mon, 7 May 2012 14:11:23 +0200 Subject: checking instance if data should be added to fminer --- lib/algorithm.rb | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 64fa508..54bb371 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -75,7 +75,7 @@ module OpenTox end end - def add_fminer_data(fminer_instance, value_map, prepare_backend=true) + def add_fminer_data(fminer_instance, value_map) # detect nr duplicates per compound @@ -119,8 +119,8 @@ module OpenTox activity= values[i].to_f end begin - fminer_instance.AddCompound(smiles,id) if prepare_backend - fminer_instance.AddActivity(activity, id) if prepare_backend + fminer_instance.AddCompound(smiles,id) if fminer_instance + fminer_instance.AddActivity(activity, id) if fminer_instance @all_activities[id]=activity # DV: insert global information @compounds[id] = compound @smi[id] = smiles @@ -579,3 +579,26 @@ module OpenTox end end end + +class Array + # collect method extended for parallel processing. + # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... } + # @param n the number of processes to spawn (default: unlimited) + def pcollect(n = nil) + nproc = 0 + result = collect do |*a| + r, w = IO.pipe + fork do + r.close + w.write( Marshal.dump( yield(*a) ) ) + end + if n and (nproc+=1) >= n + Process.wait ; nproc -= 1 + end + [ w.close, r ].last + end + Process.waitall + result.collect{|r| Marshal.load [ r.read, r.close ].first} + end +end + -- cgit v1.2.3 From d2397c0d6682989ff720bd0eb04be8b596d6b392 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Tue, 15 May 2012 15:20:25 +0200 Subject: Fixed empty entries --- lib/serializer.rb | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/lib/serializer.rb b/lib/serializer.rb index 4c26329..a1b980d 100644 --- a/lib/serializer.rb +++ b/lib/serializer.rb @@ -483,13 +483,15 @@ module OpenTox compound_sizes = {} dataset.compounds.each do |compound| entries=dataset.data_entries[compound] - entries.each do |feature, values| - compound_sizes[compound] || compound_sizes[compound] = [] - compound_sizes[compound] << values.size + if entries + entries.each do |feature, values| + compound_sizes[compound] || compound_sizes[compound] = [] + compound_sizes[compound] << values.size + end + compound_sizes[compound].uniq! + raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1 + compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array end - compound_sizes[compound].uniq! - raise "Inappropriate data for CSV export" if compound_sizes[compound].size > 1 - compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array end # substructures: complete data entries with zeroes of appropriate duplicates @@ -501,23 +503,25 @@ module OpenTox dataset.compounds.each do |compound| entries=dataset.data_entries[compound] - cmpd = Compound.new(compound) - inchi = URI.encode_www_form_component(cmpd.to_inchi) - - # allocate container - row_container = Array.new(compound_sizes[compound]) - (0...row_container.size).each do |i| - row_container[i] = Array.new(@rows.first.size) - end - - entries.each { |feature, values| - (0...compound_sizes[compound]).each { |i| - j = features.index(feature)+1 - row_container[i][0] = inchi - row_container[i][j] = values[i] + if entries + cmpd = Compound.new(compound) + inchi = URI.encode_www_form_component(cmpd.to_inchi) + + # allocate container + row_container = Array.new(compound_sizes[compound]) + (0...row_container.size).each do |i| + row_container[i] = Array.new(@rows.first.size) + end + + entries.each { |feature, values| + (0...compound_sizes[compound]).each { |i| + j = features.index(feature)+1 + row_container[i][0] = inchi + row_container[i][j] = values[i] + } } - } - row_container.each { |r| @rows << r } + row_container.each { |r| @rows << r } + end end end -- cgit v1.2.3 From 0d8141443f36a22f9442362aa633c473f8b9e485 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Mon, 21 May 2012 16:28:16 +0200 Subject: Fixed read-in of smiles for fminer --- lib/algorithm.rb | 2 +- lib/utils.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 54bb371..f27baa5 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -96,7 +96,7 @@ module OpenTox @training_dataset.compounds.each do |compound| entry=@training_dataset.data_entries[compound] begin - smiles = OpenTox::Compound.smiles(compound.to_s) + smiles = OpenTox::Compound.new(compound).to_smiles rescue LOGGER.warn "No resource for #{compound.to_s}" next diff --git a/lib/utils.rb b/lib/utils.rb index f6f8a4a..e04199d 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -467,8 +467,8 @@ module OpenTox end - # Effect calculation for classification - # @param [Array] Array of occurrences per class in the form of Enumerables. + # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise + # @param [Array] Array of occurrences per class (in the form of Enumerables). # @param [Array] Array of database instance counts per class. def self.effect(occurrences, db_instances) max=0 -- cgit v1.2.3 From bc41a12a20612a17142c51626c708ceb1f764db6 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Wed, 23 May 2012 08:54:45 +0200 Subject: Added fminer support for percentage and per-mil min frequencies --- lib/algorithm.rb | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/lib/algorithm.rb b/lib/algorithm.rb index f27baa5..8b6fca5 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -68,10 +68,36 @@ module OpenTox raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature]) unless params[:min_frequency].nil? - @minfreq=params[:min_frequency].to_i - raise "Minimum frequency must be a number >0!" unless @minfreq>0 - else - @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + # check for percentage + if params[:min_frequency].include? "pc" + per_mil=params[:min_frequency].gsub(/pc/,"") + if OpenTox::Algorithm.numeric? per_mil + per_mil = per_mil.to_i * 10 + else + bad_request=true + end + # check for per-mil + elsif params[:min_frequency].include? "pm" + per_mil=params[:min_frequency].gsub(/pm/,"") + if OpenTox::Algorithm.numeric? per_mil + per_mil = per_mil.to_i + else + bad_request=true + end + # set minfreq directly + else + if OpenTox::Algorithm.numeric? params[:min_frequency] + @minfreq=params[:min_frequency].to_i + LOGGER.debug "min_frequency #{@minfreq}" + else + bad_request=true + end + end + raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request + if @minfreq.nil? + @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) + LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)" + end end end @@ -478,7 +504,7 @@ module OpenTox @r.del_missing = params[:del_missing] == true ? 1 : 0 r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_") @r.f_fds_r = r_result_file.to_s - + # need packs 'randomForest', 'RANN' @r.eval <<-EOR suppressPackageStartupMessages(library('caret')) @@ -487,17 +513,17 @@ module OpenTox suppressPackageStartupMessages(library('doMC')) registerDoMC() set.seed(1) - + acts = read.csv(ds_csv_file, check.names=F) feats = read.csv(fds_csv_file, check.names=F) ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-) - + features = ds[,(dim(acts)[2]+1):(dim(ds)[2])] y = ds[,which(names(ds) == prediction_feature)] - + # assumes a data matrix 'features' and a vector 'y' of target values row.names(features)=NULL - + # features with all values missing removed na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) ) features = features[,!names(features) %in% na_col] @@ -509,7 +535,7 @@ module OpenTox # features with zero variance removed zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) ) features = features[,!names(features) %in% zero_var] - + pp = NULL if (del_missing) { # needed if rows should be removed @@ -522,11 +548,11 @@ module OpenTox pp = preProcess(features, method=c("scale", "center", "knnImpute")) } features = predict(pp, features) - + # features with nan values removed (sometimes preProcess return NaN values) nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) ) features = features[,!names(features) %in% nan_col] - + # determine subsets subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7) #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) @@ -535,10 +561,10 @@ module OpenTox subsets = unique(sort(round(subsets))) subsets = subsets[subsets<=dim(features)[2]] subsets = subsets[subsets>1] - + # Recursive feature elimination rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets) - + # read existing dataset and select most useful features csv=feats[,c("SMILES", rfProfile$optVariables)] write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='') -- cgit v1.2.3