summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Vorgrimmler <vorgrimmlerdavid@gmx.de>2012-04-19 16:06:33 +0200
committerDavid Vorgrimmler <vorgrimmlerdavid@gmx.de>2012-04-19 16:06:33 +0200
commit6dab51b5cf637b7c0d3d8585fe63f4116553ac5c (patch)
treed215da070142639e8d65ec30b24d3608e0561b38
parentad7cd1120e982253ecf0b515cc90dd0e45267685 (diff)
Manual merge with development.
-rw-r--r--ChangeLog8
-rw-r--r--VERSION2
-rw-r--r--lib/algorithm.rb2
-rw-r--r--lib/authorization.rb18
-rw-r--r--lib/compound.rb2
-rw-r--r--lib/environment.rb2
-rw-r--r--lib/model.rb3
-rw-r--r--lib/parser.rb1
-rw-r--r--lib/r-util.rb123
-rw-r--r--lib/stratification.R78
-rw-r--r--lib/utils.rb4
11 files changed, 189 insertions, 54 deletions
diff --git a/ChangeLog b/ChangeLog
index de9e01b..5872d56 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+v3.1.0 2012-02-24
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
+ gauss() removed
+
v3.0.1 2011-10-19
* feature: model registration to ontology service
* ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
index cb2b00e..fd2a018 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.1
+3.1.0
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 8d661b5..b921b9c 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -551,7 +551,7 @@ module OpenTox
# @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
# @return [Hash] Hash with matching Smarts and number of hits
def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib],params[:subjectid])
end
end
diff --git a/lib/authorization.rb b/lib/authorization.rb
index 5d57781..a9744e9 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,13 +37,15 @@ module OpenTox
#Loads and sends Policyfile(XML) to open-sso server
# @param [String] URI to create a policy for
- def send(uri)
+ def send(uri)
xml = get_xml(uri)
ret = false
- ret = Authorization.create_policy(xml, @subjectid)
+ ret = Authorization.create_policy(xml, @subjectid)
+ LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
+ ret = Authorization.create_policy(xml, @subjectid) if !ret
LOGGER.debug "Policy send with subjectid: #{@subjectid}"
LOGGER.warn "Not created Policy is: #{xml}" if !ret
- ret
+ ret
end
end
@@ -337,7 +339,7 @@ module OpenTox
# @param [String] subjectid
# @return [Boolean] true if access granted, else otherwise
def self.authorized?(uri, request_method, subjectid)
- if CONFIG[:authorization][:free_request].include?(request_method)
+ if CONFIG[:authorization][:free_request].include?(request_method)
#LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
true
elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -360,7 +362,7 @@ module OpenTox
false
end
end
-
+
private
def self.free_uri?(uri, request_method)
if CONFIG[:authorization][:free_uris]
@@ -374,7 +376,7 @@ module OpenTox
end
return false
end
-
+
def self.authorize_exception?(uri, request_method)
if CONFIG[:authorization][:authorize_exceptions]
CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -387,6 +389,6 @@ module OpenTox
end
return false
end
-
+
end
-end \ No newline at end of file
+end
diff --git a/lib/compound.rb b/lib/compound.rb
index b180b15..a08d541 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -259,7 +259,7 @@ module OpenTox
uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib})
ds = OpenTox::Dataset.find(uri)
entry = ds.data_entries[self.uri]
- ds.delete
+ ds.delete(subjectid)
temp_ds.delete
end
features = entry.keys
diff --git a/lib/environment.rb b/lib/environment.rb
index 6a72ba5..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
diff --git a/lib/model.rb b/lib/model.rb
index f8d98ba..c9d367e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -258,7 +258,8 @@ module OpenTox
:features => @features,
:feature_dataset_uri => @metadata[OT.featureDataset],
:pc_type => self.parameter(\"pc_type\"),
- :lib => self.parameter(\"lib\")
+ :lib => self.parameter(\"lib\"),
+ :subjectid => subjectid
})")
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
diff --git a/lib/parser.rb b/lib/parser.rb
index 2e1dc5d..07b44db 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -536,6 +536,7 @@ module OpenTox
def initialize
@data = {}
@activity_errors = []
+ @max_class_values = 3
end
def feature_values(feature)
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 7163c46..0d4e82c 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir
require "tempfile"
+class Array
+
+ def check_uniq
+ hash = {}
+ self.each do |x|
+ raise "duplicate #{x}" if hash[x]
+ hash[x] = true
+ end
+ end
+
+end
+
module OpenTox
class RUtil
@@ -75,12 +87,10 @@ module OpenTox
end
# embedds feature values of two datasets into 2D and plots it
- # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
#
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
- features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+ features=nil, subjectid=nil, waiting_task=nil)
- raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
LOGGER.debug("r-util> create feature value plot")
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -102,17 +112,13 @@ module OpenTox
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
@r.names = [dataset_name1, dataset_name2]
LOGGER.debug("r-util> - convert data to 2d")
- @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+ #@r.eval "save.image(\"/tmp/image.R\")"
+ @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
waiting_task.progress(75) if waiting_task
- if fast_plot
- info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
- else
- info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
- end
LOGGER.debug("r-util> - plot data")
plot_to_files(files) do |file|
- @r.eval "plot_split( df.2d, split, names, #{info})"
+ @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
end
end
@@ -170,19 +176,68 @@ module OpenTox
end
end
- # stratified splits a dataset into two dataset the feature values
+ # stratified splits a dataset into two datasets according to the feature values
+ # all features are taken into account unless <split_features> is given
+ # returns two datasets
+ def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+ stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
+ end
+
+ # stratified splits a dataset into k datasets according to the feature values
# all features are taken into account unless <split_features> is given
- def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+ # returns two arrays of datasets
+ def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
+ stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
+ end
+
+ private
+ def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
+ raise "internal error" if num_folds!=nil and pct!=nil
+ k_fold_split = num_folds!=nil
+ if k_fold_split
+ raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
+ else
+ raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
+ end
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+ raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
+ raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
- df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+ df = dataset_to_dataframe( dataset, missing_values, subjectid)
@r.eval "set.seed(#{seed})"
- @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
- split = @r.pull 'split'
- split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
- split_to_datasets( df, split, subjectid )
+ str_split_features = ""
+ if split_features
+ @r.split_features = split_features if split_features
+ str_split_features = "colnames=split_features"
+ end
+ @r.eval "save.image(\"/tmp/image.R\")"
+
+ if k_fold_split
+ @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
+ split = @r.pull 'split'
+ train = []
+ test = []
+ num_folds.times do |f|
+ datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
+ metadata[DC.title] = "training "+datasetname
+ train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
+ metadata[DC.title] = "test "+datasetname
+ test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
+ end
+ return train, test
+ else
+ puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+ split = @r.pull 'split'
+ metadata[DC.title] = "Training dataset split of "+dataset.uri
+ train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+ metadata[DC.title] = "Test dataset split of "+dataset.uri
+ test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
+ return train, test
+ end
end
+ public
# dataset should be loaded completely (use Dataset.find)
# takes duplicates into account
@@ -212,9 +267,13 @@ module OpenTox
features = dataset.features.keys.sort
end
compounds = []
+ compound_names = []
dataset.compounds.each do |c|
+ count = 0
num_compounds[c].times do |i|
compounds << c
+ compound_names << "#{c}$#{count}"
+ count+=1
end
end
@@ -238,7 +297,7 @@ module OpenTox
end
end
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
- assign_dataframe(df_name,d_values,compounds,features)
+ assign_dataframe(df_name,d_values,compound_names,features)
# set dataframe column types accordingly
f_count = 1 #R starts at 1
@@ -264,16 +323,18 @@ module OpenTox
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
- def dataframe_to_dataset( df, subjectid=nil )
- dataframe_to_dataset_indices( df, subjectid, nil)
+ def dataframe_to_dataset( df, metadata={}, subjectid=nil )
+ dataframe_to_dataset_indices( df, metadata, subjectid, nil)
end
private
- def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+ def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
raise unless @@feats[df].size>0
- values, compounds, features = pull_dataframe(df)
+ values, compound_names, features = pull_dataframe(df)
+ compounds = compound_names.collect{|c| c.split("$")[0]}
features.each{|f| raise unless @@feats[df][f]}
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ dataset.add_metadata(metadata)
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
@@ -290,16 +351,12 @@ module OpenTox
dataset
end
- def split_to_datasets( df, split, subjectid=nil )
- sets = []
- (split.min.to_i .. split.max.to_i).each do |i|
- indices = []
- split.size.times{|j| indices<<j if split[j]==i}
- dataset = dataframe_to_dataset_indices( df, subjectid, indices )
- LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
- sets << dataset
- end
- sets
+ def split_to_dataset( df, split, metadata={}, subjectid=nil )
+ indices = []
+ split.size.times{|i| indices<<i if yield(split[i]) }
+ dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+ dataset
end
def pull_dataframe(df)
@@ -323,6 +380,8 @@ module OpenTox
end
def assign_dataframe(df,input,rownames,colnames)
+ rownames.check_uniq if rownames
+ colnames.check_uniq if colnames
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
file = File.new(tmp, 'w')
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
diff --git a/lib/stratification.R b/lib/stratification.R
index 76ff2d8..3f8698c 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,4 +1,13 @@
+round_it <- function( x )
+{
+ if(isTRUE((x - floor(x))>=0.5))
+ ceiling(x)
+ else
+ floor(x)
+}
+
+
nominal_to_binary <- function( data )
{
result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
result
}
-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
{
data.num <- as.data.frame(data)
+ if (!is.null(colnames))
+ {
+ data.num = subset(data.num, select = colnames)
+ }
if (!is.numeric(data.num))
{
data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
cbind(s$partition[,m])
}
-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
{
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
if (method == "samplecube")
{
require("sampling")
# adjust ratio to make samplecube return exact number of samples
- ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
data.strat = cbind(pik,data.processed)
samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
stop("unknown method")
}
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
{
print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
if (method == "samplecube")
{
folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
{
require("TunePareto")
cl = cluster(data.processed)
- res = generateCVRuns(cl,ntimes=1,nfold=3)
+ res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
folds = rep(0, times=nrow(data))
for (i in 1:num_folds)
for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
stop("unknown method")
}
+duplicate_indices <- function( data ) {
+ indices = 1:nrow(data)
+ z = data
+ duplicate_index = anyDuplicated(z)
+ while(duplicate_index) {
+ duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+ #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+ indices[duplicate_index] <- duplicate_to_index
+ z[duplicate_index,] <- paste('123$§%',duplicate_index)
+ duplicate_index = anyDuplicated(z)
+ }
+ indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+ result = data[1,]
+ for(i in 2:length(dup_indices)) {
+ row = data[rownames(data)==dup_indices[i],]
+ if(length(row)==0)
+ stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+ result = rbind(result, row)
+ }
+ rownames(result)<-NULL
+ result
+}
+
+sammon_duplicates <- function( data, ... ) {
+ di <- duplicate_indices(data)
+ print(di)
+ u <- unique(data)
+ print(paste('unique data points',nrow(u),'of',nrow(data)))
+ if(nrow(u) <= 4) stop("number of unique datapoints <= 4")
+ points_unique <- sammon(dist(u), ...)$points
+ if (nrow(u)<nrow(data))
+ {
+ points <- add_duplicates(points_unique, di)
+ points
+ }
+ else
+ {
+ points_unique
+ }
+}
+
plot_pre_process <- function( data, method="pca" )
{
data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
data.emb$conf
}
+ else if (method == "sammon")
+ {
+ require("MASS")
+ sammon_duplicates(data.processed, k=2)
+ }
else
stop("unknown method")
}
diff --git a/lib/utils.rb b/lib/utils.rb
index a3f8161..88b8347 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -359,7 +359,7 @@ module OpenTox
# @param[Hash] keys: SMILES, values: InChIs
# @param[Array] field descriptions, one for each feature
# @return[Array] CSV, array of field ids, array of field descriptions
- def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids)
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
master=nil
ids=[]
@@ -369,7 +369,7 @@ module OpenTox
(1...ambit_result_uri.size).collect { |idx|
curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
#LOGGER.debug "Requesting #{curr_uri}"
- csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) )
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
if csv_data[0] && csv_data[0].size>1
if master.nil? # This is the smiles entry
(1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }