diff options
author | davor <vorgrimmlerdavid@gmx.de> | 2012-03-20 13:04:53 +0100 |
---|---|---|
committer | davor <vorgrimmlerdavid@gmx.de> | 2012-03-20 13:04:53 +0100 |
commit | 45423141fa0ac10d46b049799ac129b47241e665 (patch) | |
tree | 8afa27a08d9f919d23f492382f2479e1de7d20a5 | |
parent | d0ee7e993f851407263962e0d8d3b9c3a2a76de6 (diff) |
Backup including important changes from devjl_dv
train_success and dataset cleanup + own changes (get compound name
function, small dataset fix)
-rw-r--r-- | lib/algorithm.rb | 14 | ||||
-rw-r--r-- | lib/compound.rb | 42 | ||||
-rw-r--r-- | lib/transform.rb | 2 | ||||
-rw-r--r-- | lib/utils.rb | 2 |
4 files changed, 52 insertions, 8 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 40cbc84..19666ad 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -381,10 +381,10 @@ module OpenTox else #LOGGER.debug gram_matrix.to_yaml @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "set.seed(1)" @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab" @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore" @r.eval "registerDoMC()" # switch on parallel processing + @r.eval "set.seed(1)" begin # set data @@ -417,7 +417,7 @@ module OpenTox # model + support vectors LOGGER.debug "Creating R SVM model ..." - @r.eval <<-EOR + train_success = @r.eval <<-EOR model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale")) perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) EOR @@ -431,6 +431,7 @@ module OpenTox # censoring prediction = nil if ( @r.perf.nan? 
|| @r.perf < min_train_performance ) + prediction = nil unless train_success LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}" rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" @@ -459,12 +460,12 @@ module OpenTox # need packs 'randomForest', 'RANN' @r.eval <<-EOR - set.seed(1) suppressPackageStartupMessages(library('caret')) suppressPackageStartupMessages(library('randomForest')) suppressPackageStartupMessages(library('RANN')) suppressPackageStartupMessages(library('doMC')) registerDoMC() + set.seed(1) acts = read.csv(ds_csv_file, check.names=F) feats = read.csv(fds_csv_file, check.names=F) @@ -506,15 +507,16 @@ module OpenTox features = features[,!names(features) %in% nan_col] # determine subsets - subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) - subsets = c(2,3,4,5,7,10,subsets) + subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7) + #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7) + #subsets = c(2,3,4,5,7,10,subsets) #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30) subsets = unique(sort(round(subsets))) subsets = subsets[subsets<=dim(features)[2]] subsets = subsets[subsets>1] # Recursive feature elimination - rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, method='cv'), sizes=subsets) + rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets) optVar = rfProfile$optVariables if (rfProfile$bestSubset == dim(features)[2]) { newRMSE = rfProfile$results$RMSE diff --git a/lib/compound.rb b/lib/compound.rb index c7c82c3..5024af1 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -3,6 +3,7 @@ module OpenTox + require "rexml/document" # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure). 
class Compound @@ -130,6 +131,47 @@ module OpenTox "not available" end end + + + # Get all known compound names sorted by classification. Relies on an external service for name lookups. + # @example + # names = compound.to_names_hash + # @return [Hash] Classification => Name Array + def to_names_hash + begin + xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml") + xmldoc = REXML::Document.new(xml) + data = {} + + xmldoc.root.elements[1].elements.each{|e| + if data.has_key?(e.attribute("classification").value) == false + data[e.attribute("classification").value] = [e.text] + else + data[e.attribute("classification").value].push(e.text) + end + } + data + rescue + "not available" + end + end + + # Get all known compound names sorted by classification. Relies on an external service for name lookups. + # @example + # names = compound.to_names_hash + # @return [Hash] Classification => Name Array + def to_ambit_names_hash + begin + ds = OpenTox::Dataset.new + ds.save + ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}")) + ds.save + ds.uri + rescue + "not available" + end + end + # Match a smarts string # @example diff --git a/lib/transform.rb b/lib/transform.rb index 8fe1093..cb530a3 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -396,7 +396,7 @@ module OpenTox @q_prop = gsl_q_prop_orig.row(0).to_a end - LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop) LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" @sims = [ gram_matrix, @sims ] diff --git a/lib/utils.rb b/lib/utils.rb index d96c2b3..eccec46 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -60,7 +60,7 @@ module OpenTox ds = OpenTox::Dataset.new ds.save parser.dataset = ds - if compounds.size == 1 + if compounds.size < 4 ds = parser.load_csv(master.collect{|r| 
r.join(",")}.join("\n"),false,true,false) # all_numeric = true, all features should be treated as numeric else ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,false,true) # del_nominal = true, which removes nominal features |