author     David Vorgrimmler <vorgrimmlerdavid@gmx.de>  2012-04-19 16:06:33 +0200
committer  David Vorgrimmler <vorgrimmlerdavid@gmx.de>  2012-04-19 16:06:33 +0200
commit     6dab51b5cf637b7c0d3d8585fe63f4116553ac5c (patch)
tree       d215da070142639e8d65ec30b24d3608e0561b38
parent     ad7cd1120e982253ecf0b515cc90dd0e45267685 (diff)
Manual merge with development.
-rw-r--r--  ChangeLog             |   8
-rw-r--r--  VERSION               |   2
-rw-r--r--  lib/algorithm.rb      |   2
-rw-r--r--  lib/authorization.rb  |  18
-rw-r--r--  lib/compound.rb       |   2
-rw-r--r--  lib/environment.rb    |   2
-rw-r--r--  lib/model.rb          |   3
-rw-r--r--  lib/parser.rb         |   1
-rw-r--r--  lib/r-util.rb         | 123
-rw-r--r--  lib/stratification.R  |  78
-rw-r--r--  lib/utils.rb          |   4

11 files changed, 189 insertions(+), 54 deletions(-)
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+v3.1.0 2012-02-24
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
+   gauss() removed
+
 v3.0.1 2011-10-19
  * feature: model registration to ontology service
  * ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.1
+3.1.0
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 8d661b5..b921b9c 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -551,7 +551,7 @@ module OpenTox
       # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
       # @return [Hash] Hash with matching Smarts and number of hits
       def self.lookup(params)
-        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib])
+        params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib],params[:subjectid])
       end
     end
diff --git a/lib/authorization.rb b/lib/authorization.rb
index 5d57781..a9744e9 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,13 +37,15 @@ module OpenTox

     #Loads and sends Policyfile(XML) to open-sso server
     # @param [String] URI to create a policy for
-    def send(uri)
+    def send(uri)
      xml = get_xml(uri)
      ret = false
-      ret = Authorization.create_policy(xml, @subjectid)
+      ret = Authorization.create_policy(xml, @subjectid)
+      LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
+      ret = Authorization.create_policy(xml, @subjectid) if !ret
      LOGGER.debug "Policy send with subjectid: #{@subjectid}"
      LOGGER.warn "Not created Policy is: #{xml}" if !ret
-      ret
+      ret
    end
  end
@@ -337,7 +339,7 @@ module OpenTox
  # @param [String] subjectid
  # @return [Boolean] true if access granted, else otherwise
  def self.authorized?(uri, request_method, subjectid)
-    if CONFIG[:authorization][:free_request].include?(request_method)
+    if CONFIG[:authorization][:free_request].include?(request_method)
      #LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
      true
    elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -360,7 +362,7 @@ module OpenTox
      false
    end
  end
-
+
  private
  def self.free_uri?(uri, request_method)
    if CONFIG[:authorization][:free_uris]
@@ -374,7 +376,7 @@ module OpenTox
    end
    return false
  end
-
+
  def self.authorize_exception?(uri, request_method)
    if CONFIG[:authorization][:authorize_exceptions]
      CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -387,6 +389,6 @@ module OpenTox
    end
    return false
  end
-
+
 end
-end
\ No newline at end of file
+end
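The change to Authorization#send above retries a failed create_policy exactly once before giving up. A minimal sketch of that retry-once idiom in isolation (the send_with_retry wrapper is illustrative and not part of the library; Authorization.create_policy and LOGGER are the calls used in the hunk):

    # Retry an unreliable service call once before reporting failure.
    def send_with_retry(xml, subjectid)
      ret = Authorization.create_policy(xml, subjectid)
      unless ret
        LOGGER.warn "create_policy failed for subjectid: #{subjectid}, retrying once"
        ret = Authorization.create_policy(xml, subjectid)  # second and last attempt
      end
      ret
    end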
diff --git a/lib/compound.rb b/lib/compound.rb
index b180b15..a08d541 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -259,7 +259,7 @@ module OpenTox
        uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib})
        ds = OpenTox::Dataset.find(uri)
        entry = ds.data_entries[self.uri]
-        ds.delete
+        ds.delete(subjectid)
        temp_ds.delete
      end
      features = entry.keys
diff --git a/lib/environment.rb b/lib/environment.rb
index 6a72ba5..c1b8312 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
 OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
 OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
 XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'

diff --git a/lib/model.rb b/lib/model.rb
index f8d98ba..c9d367e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -258,7 +258,8 @@ module OpenTox
            :features => @features,
            :feature_dataset_uri => @metadata[OT.featureDataset],
            :pc_type => self.parameter(\"pc_type\"),
-            :lib => self.parameter(\"lib\")
+            :lib => self.parameter(\"lib\"),
+            :subjectid => subjectid
          })")

      # Adding fingerprint of query compound with features and values(p_value*nr_hits)
diff --git a/lib/parser.rb b/lib/parser.rb
index 2e1dc5d..07b44db 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -536,6 +536,7 @@ module OpenTox
    def initialize
      @data = {}
      @activity_errors = []
+      @max_class_values = 3
    end

    def feature_values(feature)
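The compound.rb, model.rb, and algorithm.rb hunks all follow one pattern: the A&A token (subjectid) is threaded one call deeper so that every request to a protected OpenTox service carries it. A hedged sketch of the pattern (the helper name is hypothetical; Dataset.find and delete accepting a subjectid are shown in the hunks above):

    # Every call that touches a protected OpenTox service forwards the token.
    def delete_dataset_with_auth(dataset_uri, subjectid=nil)
      ds = OpenTox::Dataset.find(dataset_uri, subjectid)  # authenticated lookup
      ds.delete(subjectid)                                # authenticated delete
    end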
"pca" : "smacof")}')" + #@r.eval "save.image(\"/tmp/image.R\")" + @r.eval "df.2d <- plot_pre_process(df, method='sammon')" waiting_task.progress(75) if waiting_task - if fast_plot - info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'" - else - info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'" - end LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, #{info})" + @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')" end end @@ -170,19 +176,68 @@ module OpenTox end end - # stratified splits a dataset into two dataset the feature values + # stratified splits a dataset into two dataset according to the feature values + # all features are taken into account unless <split_features> is given + # returns two datases + def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) + end + + # stratified splits a dataset into k datasets according the feature values # all features are taken into account unless <split_features> is given - def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + # returns two arrays of datasets + def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features ) + end + + private + def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) + raise "internal error" if num_folds!=nil and pct!=nil + k_fold_split = num_folds!=nil + if k_fold_split + raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum) + else + raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) + end raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 + raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 + raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") - df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features ) + df = dataset_to_dataframe( dataset, missing_values, subjectid) @r.eval "set.seed(#{seed})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct})" - split = @r.pull 'split' - split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set - split_to_datasets( df, split, subjectid ) + str_split_features = "" + if split_features + @r.split_features = split_features if split_features + str_split_features = "colnames=split_features" + end + @r.eval "save.image(\"/tmp/image.R\")" + + if k_fold_split + @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})" + split = @r.pull 'split' + train = [] + test = [] + num_folds.times do |f| + datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s + metadata[DC.title] = "training "+datasetname + train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) } + metadata[DC.title] = "test "+datasetname + test << split_to_dataset( df, split, metadata, 
diff --git a/lib/stratification.R b/lib/stratification.R
index 76ff2d8..3f8698c 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,4 +1,13 @@

+round_it <- function( x )
+{
+  if(isTRUE((x - floor(x))>=0.5))
+    ceiling(x)
+  else
+    floor(x)
+}
+
+
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
   result
 }

-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
 {
   data.num <- as.data.frame(data)
+  if (!is.null(colnames))
+  {
+    data.num = subset(data.num, select = colnames)
+  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }

-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
 {
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
     stop("unknown method")
 }

-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=3)
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
     stop("unknown method")
 }

+duplicate_indices <- function( data ) {
+  indices = 1:nrow(data)
+  z = data
+  duplicate_index = anyDuplicated(z)
+  while(duplicate_index) {
+    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+    indices[duplicate_index] <- duplicate_to_index
+    z[duplicate_index,] <- paste('123$§%',duplicate_index)
+    duplicate_index = anyDuplicated(z)
+  }
+  indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+  result = data[1,]
+  for(i in 2:length(dup_indices)) {
+    row = data[rownames(data)==dup_indices[i],]
+    if(length(row)==0)
+      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+    result = rbind(result, row)
+  }
+  rownames(result)<-NULL
+  result
+}
+
+sammon_duplicates <- function( data, ... ) {
+  di <- duplicate_indices(data)
+  print(di)
+  u <- unique(data)
+  print(paste('unique data points',nrow(u),'of',nrow(data)))
+  if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
+  points_unique <- sammon(dist(u), ...)$points
+  if (nrow(u)<nrow(data))
+  {
+    points <- add_duplicates(points_unique, di)
+    points
+  }
+  else
+  {
+    points_unique
+  }
+}
+
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
+  else if (method == "sammon")
+  {
+    require("MASS")
+    sammon_duplicates(data.processed, k=2)
+  }
   else
     stop("unknown method")
 }
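round_it replaces R's round() in the samplecube ratio adjustment, presumably because R rounds half to even (round(0.5) is 0, round(2.5) is 2), which can shift the returned sample count by one. For illustration only, the same round-half-up rule in Ruby (a sketch, not library code):

    # Round half up, mirroring round_it in stratification.R.
    def round_half_up(x)
      (x - x.floor) >= 0.5 ? x.ceil : x.floor
    end

    round_half_up(2.5)  #=> 3  (R's round(2.5) yields 2)
    round_half_up(2.4)  #=> 2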
diff --git a/lib/utils.rb b/lib/utils.rb
index a3f8161..88b8347 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -359,7 +359,7 @@ module OpenTox
     # @param[Hash] keys: SMILES, values: InChIs
     # @param[Array] field descriptions, one for each feature
     # @return[Array] CSV, array of field ids, array of field descriptions
-    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids)
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
       master=nil
       ids=[]

@@ -369,7 +369,7 @@ module OpenTox
       (1...ambit_result_uri.size).collect { |idx|
         curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
         #LOGGER.debug "Requesting #{curr_uri}"
-        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) )
+        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
         if csv_data[0] && csv_data[0].size>1
           if master.nil? # This is the smiles entry
             (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
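With the utils.rb change, each Ambit CSV request also carries the token. A hedged sketch of the resulting call shape (curr_uri is a placeholder; RestClientWrapper.get and the header hash are taken from the hunk above):

    require "csv"

    curr_uri = "http://example.org/ambit/dataset/1"  # placeholder URI
    raw = OpenTox::RestClientWrapper.get(curr_uri,
            {:accept => "text/csv", :subjectid => subjectid})
    csv_data = CSV.parse(raw)  # column 1 holds SMILES, rewritten to InChI by load_ds_csv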