summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordavor <vorgrimmlerdavid@gmx.de>2012-03-20 13:04:53 +0100
committerdavor <vorgrimmlerdavid@gmx.de>2012-03-20 13:04:53 +0100
commit45423141fa0ac10d46b049799ac129b47241e665 (patch)
tree8afa27a08d9f919d23f492382f2479e1de7d20a5
parentd0ee7e993f851407263962e0d8d3b9c3a2a76de6 (diff)
Backup including important changes from devjl_dv
train_success and dataset cleanup + own changes (get compound name function, small dataset fix)
-rw-r--r--lib/algorithm.rb14
-rw-r--r--lib/compound.rb42
-rw-r--r--lib/transform.rb2
-rw-r--r--lib/utils.rb2
4 files changed, 52 insertions, 8 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 40cbc84..19666ad 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -381,10 +381,10 @@ module OpenTox
else
#LOGGER.debug gram_matrix.to_yaml
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "set.seed(1)"
@r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
@r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
@r.eval "registerDoMC()" # switch on parallel processing
+ @r.eval "set.seed(1)"
begin
# set data
@@ -417,7 +417,7 @@ module OpenTox
# model + support vectors
LOGGER.debug "Creating R SVM model ..."
- @r.eval <<-EOR
+ train_success = @r.eval <<-EOR
model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
EOR
@@ -431,6 +431,7 @@ module OpenTox
# censoring
prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ prediction = nil unless train_success
LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
rescue Exception => e
LOGGER.debug "#{e.class}: #{e.message}"
@@ -459,12 +460,12 @@ module OpenTox
# need packs 'randomForest', 'RANN'
@r.eval <<-EOR
- set.seed(1)
suppressPackageStartupMessages(library('caret'))
suppressPackageStartupMessages(library('randomForest'))
suppressPackageStartupMessages(library('RANN'))
suppressPackageStartupMessages(library('doMC'))
registerDoMC()
+ set.seed(1)
acts = read.csv(ds_csv_file, check.names=F)
feats = read.csv(fds_csv_file, check.names=F)
@@ -506,15 +507,16 @@ module OpenTox
features = features[,!names(features) %in% nan_col]
# determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
#subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
# Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, method='cv'), sizes=subsets)
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
optVar = rfProfile$optVariables
if (rfProfile$bestSubset == dim(features)[2]) {
newRMSE = rfProfile$results$RMSE
diff --git a/lib/compound.rb b/lib/compound.rb
index c7c82c3..5024af1 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
module OpenTox
+ require "rexml/document"
# Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
class Compound
@@ -130,6 +131,47 @@ module OpenTox
"not available"
end
end
+
+
+ # Get all known compound names grouped by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash, String] Classification => Name Array, or "not available" if the lookup fails
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Query the AMBIT name search for this compound and store the result in a new dataset. Relies on an external service for name lookups.
+ # @example
+ # uri = compound.to_ambit_names_hash
+ # @return [String] URI of the saved dataset, or "not available" if the lookup fails
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+
# Match a smarts string
# @example
diff --git a/lib/transform.rb b/lib/transform.rb
index 8fe1093..cb530a3 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -396,7 +396,7 @@ module OpenTox
@q_prop = gsl_q_prop_orig.row(0).to_a
end
- LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
@sims = [ gram_matrix, @sims ]
diff --git a/lib/utils.rb b/lib/utils.rb
index d96c2b3..eccec46 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -60,7 +60,7 @@ module OpenTox
ds = OpenTox::Dataset.new
ds.save
parser.dataset = ds
- if compounds.size == 1
+ if compounds.size < 4
ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true,false) # all_numeric = true, all features should be treated as numeric
else
ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,false,true) # del_nominal = true, which removes nominal features