author     David Vorgrimmler <vorgrimmlerdavid@gmx.de>   2012-06-05 16:03:30 +0200
committer  David Vorgrimmler <vorgrimmlerdavid@gmx.de>   2012-06-05 16:03:30 +0200
commit     e2d670cb86d7b78f69d7f5d24a8cae5d79505e0f (patch)
tree       2163a3e46afd2938e0a9e48aba6e00e26764f183
parent     9cc85cd21d4c400b2a5c20068efaac5cde56a476 (diff)
parent     bc41a12a20612a17142c51626c708ceb1f764db6 (diff)
Merge branch 'bbrc-sample' into development
-rw-r--r--  lib/algorithm.rb   113
-rw-r--r--  lib/serializer.rb   48
-rw-r--r--  lib/utils.rb         4
3 files changed, 117 insertions, 48 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index b921b9c..8b6fca5 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -68,19 +68,61 @@ module OpenTox
raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
unless params[:min_frequency].nil?
- @minfreq=params[:min_frequency].to_i
- raise "Minimum frequency must be a number >0!" unless @minfreq>0
- else
- @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ # check for percentage
+ if params[:min_frequency].include? "pc"
+ per_mil=params[:min_frequency].gsub(/pc/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i * 10
+ else
+ bad_request=true
+ end
+ # check for per-mil
+ elsif params[:min_frequency].include? "pm"
+ per_mil=params[:min_frequency].gsub(/pm/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i
+ else
+ bad_request=true
+ end
+ # set minfreq directly
+ else
+ if OpenTox::Algorithm.numeric? params[:min_frequency]
+ @minfreq=params[:min_frequency].to_i
+ LOGGER.debug "min_frequency #{@minfreq}"
+ else
+ bad_request=true
+ end
+ end
+ raise OpenTox::BadRequestError.new "Minimum frequency must be an integer [n], a percentage [n]pc, or a per-mil value [n]pm, with n greater than 0" if bad_request
+ if @minfreq.nil?
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+ LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
+ end
end
end
def add_fminer_data(fminer_instance, value_map)
+
+ # detect the number of duplicate entries per compound
+ compound_sizes = {}
+ @training_dataset.compounds.each do |compound|
+ entries=@training_dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end
+
id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry| #order of compounds does not influence result
+
+ @training_dataset.compounds.each do |compound|
+ entry=@training_dataset.data_entries[compound]
begin
- smiles = OpenTox::Compound.smiles(compound.to_s)
+ smiles = OpenTox::Compound.new(compound).to_smiles
rescue
LOGGER.warn "No resource for #{compound.to_s}"
next
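Note: the new min_frequency handling accepts a plain integer, a percentage suffixed with "pc", or a per-mil value suffixed with "pm"; only in the latter two cases is the per-mil value later converted to an absolute count via OpenTox::Algorithm.min_frequency. A minimal standalone sketch of the same parsing logic (the method name is illustrative, not part of the library):

    # Illustrative sketch of the min_frequency parsing introduced above:
    # plain integers are absolute counts, "pc" marks a percentage,
    # "pm" a per-mil value.
    def parse_min_frequency(raw)
      case raw.to_s
      when /\A(\d+)pc\z/ then { :per_mil => $1.to_i * 10 }  # e.g. "5pc" -> 50 per-mil
      when /\A(\d+)pm\z/ then { :per_mil => $1.to_i }       # e.g. "8pm" ->  8 per-mil
      when /\A\d+\z/     then { :minfreq => raw.to_i }      # e.g. "10"  -> absolute count
      else
        raise ArgumentError, "Minimum frequency must be an integer [n], a percentage [n]pc, or a per-mil value [n]pm, with n greater than 0"
      end
    end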
@@ -92,29 +134,29 @@ module OpenTox
entry.each do |feature,values|
if feature == @prediction_feature.uri
- values.each do |value|
- if value.nil?
+ (0...compound_sizes[compound]).each { |i|
+ if values[i].nil?
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
+ activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
- activity= value.to_f
+ activity= values[i].to_f
end
begin
- fminer_instance.AddCompound(smiles,id)
- fminer_instance.AddActivity(activity, id)
+ fminer_instance.AddCompound(smiles,id) if fminer_instance
+ fminer_instance.AddActivity(activity, id) if fminer_instance
@all_activities[id]=activity # DV: insert global information
@compounds[id] = compound
@smi[id] = smiles
id += 1
rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
LOGGER.warn e.backtrace
end
end
- end
+ }
end
end
end
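Note: add_fminer_data now feeds every replicate measurement of a compound into fminer separately; the guard above requires that all features of a compound carry the same number of values. A reduced sketch of that consistency check (the helper name is illustrative):

    # Illustrative consistency check: every feature of a compound must
    # report the same number of replicate values; return that count.
    def replicate_count(data_entries, compound)
      sizes = data_entries[compound].values.collect { |values| values.size }.uniq
      raise "Inappropriate data for fminer" if sizes.size > 1
      sizes.first
    end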
@@ -462,7 +504,7 @@ module OpenTox
@r.del_missing = params[:del_missing] == true ? 1 : 0
r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
@r.f_fds_r = r_result_file.to_s
-
+
# need packs 'randomForest', 'RANN'
@r.eval <<-EOR
suppressPackageStartupMessages(library('caret'))
@@ -471,17 +513,17 @@ module OpenTox
suppressPackageStartupMessages(library('doMC'))
registerDoMC()
set.seed(1)
-
+
acts = read.csv(ds_csv_file, check.names=F)
feats = read.csv(fds_csv_file, check.names=F)
ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
y = ds[,which(names(ds) == prediction_feature)]
-
+
# assumes a data matrix 'features' and a vector 'y' of target values
row.names(features)=NULL
-
+
# features with all values missing removed
na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
features = features[,!names(features) %in% na_col]
@@ -493,7 +535,7 @@ module OpenTox
# features with zero variance removed
zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
features = features[,!names(features) %in% zero_var]
-
+
pp = NULL
if (del_missing) {
# needed if rows should be removed
@@ -506,11 +548,11 @@ module OpenTox
pp = preProcess(features, method=c("scale", "center", "knnImpute"))
}
features = predict(pp, features)
-
+
# features with nan values removed (sometimes preProcess return NaN values)
nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
features = features[,!names(features) %in% nan_col]
-
+
# determine subsets
subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
#subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
@@ -519,10 +561,10 @@ module OpenTox
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
-
+
# Recursive feature elimination
rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
-
+
# read existing dataset and select most useful features
csv=feats[,c("SMILES", rfProfile$optVariables)]
write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
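Note: the R heredoc above is evaluated through the library's R bridge (@r) after the CSV paths and options have been assigned as R variables. A hedged sketch of the same wiring using the plain rinruby gem and made-up file names:

    # Hedged sketch of driving the R feature-elimination script from Ruby;
    # the production code uses the library's own R wrapper, this uses rinruby.
    require 'rinruby'

    r = RinRuby.new(false)                      # no echo of R output
    r.assign("ds_csv_file",  "activities.csv")  # illustrative paths
    r.assign("fds_csv_file", "features.csv")
    r.assign("prediction_feature", "activity")
    r.assign("del_missing", 0)
    r.assign("f_fds_r", "features_rfe.csv")
    r.eval "suppressPackageStartupMessages(library('caret'))"
    # ... remainder of the R code from the heredoc above ...
    r.quit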
@@ -563,3 +605,26 @@ module OpenTox
end
end
end
+
+class Array
+ # collect method extended for parallel processing.
+ # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+ # @param n the number of processes to spawn (default: unlimited)
+ def pcollect(n = nil)
+ nproc = 0
+ result = collect do |*a|
+ r, w = IO.pipe
+ fork do
+ r.close
+ w.write( Marshal.dump( yield(*a) ) )
+ end
+ if n and (nproc+=1) >= n
+ Process.wait ; nproc -= 1
+ end
+ [ w.close, r ].last
+ end
+ Process.waitall
+ result.collect{|r| Marshal.load [ r.read, r.close ].first}
+ end
+end
+
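Note: Array#pcollect forks one child process per element (optionally capped at n concurrent children) and marshals each block result back to the parent through a pipe; results keep the order of the input array. A small usage sketch with arbitrary data:

    # Usage sketch for the Array#pcollect extension above (POSIX only,
    # it relies on Kernel#fork). At most two children run concurrently.
    smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]           # illustrative input
    lengths = smiles_list.pcollect(2) { |smi| smi.length }
    puts lengths.inspect                                   # => [3, 8, 7]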
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 4c26329..a1b980d 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -483,13 +483,15 @@ module OpenTox
compound_sizes = {}
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
- entries.each do |feature, values|
- compound_sizes[compound] || compound_sizes[compound] = []
- compound_sizes[compound] << values.size
+ if entries
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
end
- compound_sizes[compound].uniq!
- raise "Inappropriate data for CSV export" if compound_sizes[compound].size > 1
- compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
end
# substructures: complete data entries with zeroes of appropriate duplicates
@@ -501,23 +503,25 @@ module OpenTox
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
- cmpd = Compound.new(compound)
- inchi = URI.encode_www_form_component(cmpd.to_inchi)
-
- # allocate container
- row_container = Array.new(compound_sizes[compound])
- (0...row_container.size).each do |i|
- row_container[i] = Array.new(@rows.first.size)
- end
-
- entries.each { |feature, values|
- (0...compound_sizes[compound]).each { |i|
- j = features.index(feature)+1
- row_container[i][0] = inchi
- row_container[i][j] = values[i]
+ if entries
+ cmpd = Compound.new(compound)
+ inchi = URI.encode_www_form_component(cmpd.to_inchi)
+
+ # allocate container
+ row_container = Array.new(compound_sizes[compound])
+ (0...row_container.size).each do |i|
+ row_container[i] = Array.new(@rows.first.size)
+ end
+
+ entries.each { |feature, values|
+ (0...compound_sizes[compound]).each { |i|
+ j = features.index(feature)+1
+ row_container[i][0] = inchi
+ row_container[i][j] = values[i]
+ }
}
- }
- row_container.each { |r| @rows << r }
+ row_container.each { |r| @rows << r }
+ end
end
end
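Note: the serializer now emits one CSV row per replicate measurement of a compound, each row starting with the InChI and carrying the i-th value of every feature. A compact sketch of how one compound with two replicates expands into two rows (names and values are illustrative):

    # Illustrative expansion of one compound with two replicate measurements
    # into two CSV rows, following the row_container logic above.
    features = ["feat_a", "feat_b"]
    entries  = { "feat_a" => [1.0, 2.0], "feat_b" => [0.0, 1.0] }
    inchi    = "InChI=..."                                   # placeholder

    replicates = entries.values.first.size
    rows = Array.new(replicates) { Array.new(features.size + 1) }
    entries.each do |feature, values|
      (0...replicates).each do |i|
        rows[i][0] = inchi
        rows[i][features.index(feature) + 1] = values[i]
      end
    end
    # rows => [["InChI=...", 1.0, 0.0], ["InChI=...", 2.0, 1.0]]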
diff --git a/lib/utils.rb b/lib/utils.rb
index f6f8a4a..e04199d 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -467,8 +467,8 @@ module OpenTox
end
- # Effect calculation for classification
- # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise
+ # @param [Array] Array of occurrences per class (in the form of Enumerables).
# @param [Array] Array of database instance counts per class.
def self.effect(occurrences, db_instances)
max=0
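Note: the clarified documentation states that the two arrays are aligned by class index. A hedged call sketch (the counts are invented, OpenTox::Algorithm is assumed as the enclosing module like the other helpers in utils.rb, and the return semantics are not shown in this hunk):

    # Hedged call sketch: both arrays are indexed by class (two classes here).
    occurrences  = [12, 3]    # substructure occurrences in class 1 / class 2
    db_instances = [40, 60]   # dataset instances of class 1 / class 2
    effect = OpenTox::Algorithm.effect(occurrences, db_instances)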