-rw-r--r--  ChangeLog             |   8
-rw-r--r--  VERSION               |   2
-rw-r--r--  lib/algorithm.rb      |  30
-rw-r--r--  lib/authorization.rb  |  18
-rw-r--r--  lib/compound.rb       |  53
-rw-r--r--  lib/environment.rb    |   2
-rw-r--r--  lib/model.rb          |   8
-rw-r--r--  lib/parser.rb         |  17
-rw-r--r--  lib/r-util.rb         | 123
-rw-r--r--  lib/serializer.rb     |  11
-rw-r--r--  lib/stratification.R  |  78
-rw-r--r--  lib/transform.rb      |   4
-rw-r--r--  lib/utils.rb          | 175
13 files changed, 299 insertions, 230 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,3 @@
-v3.1.0 2012-02-24
-    * utils.rb: added for special routines (e.g. descriptor calculation)
-    * task.rb: Polling with increasing interval
-    * parser.rb: CSV up and download fixed
-    * transform.rb: routines to create machine learning data matrices
-    * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
-      gauss() removed
-
 v3.0.1 2011-10-19
     * feature: model registration to ontology service
     * ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.1.0
+3.0.1
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 9dcf6a8..ebd2019 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -477,10 +477,22 @@ module OpenTox
       # assumes a data matrix 'features' and a vector 'y' of target values
       row.names(features)=NULL
 
+      # features with all values missing removed
+      na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+      features = features[,!names(features) %in% na_col]
+
+      # features with infinite values removed
+      inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+      features = features[,!names(features) %in% inf_col]
+
+      # features with zero variance removed
+      zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+      features = features[,!names(features) %in% zero_var]
+
       pp = NULL
       if (del_missing) {
         # needed if rows should be removed
-        na_ids = apply(features,1,function(x)any(is.na(x)))
+        na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
         features = features[!na_ids,]
         y = y[!na_ids]
         pp = preProcess(features, method=c("scale", "center"))
@@ -490,15 +502,21 @@ module OpenTox
       }
       features = predict(pp, features)
 
+      # features with nan values removed (sometimes preProcess return NaN values)
+      nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+      features = features[,!names(features) %in% nan_col]
+
       # determine subsets
-      subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
-      subsets = c(2,3,4,5,7,10,subsets)
+      subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+      #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+      #subsets = c(2,3,4,5,7,10,subsets)
+      #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
       subsets = unique(sort(round(subsets)))
       subsets = subsets[subsets<=dim(features)[2]]
       subsets = subsets[subsets>1]
-      
+
       # Recursive feature elimination
-      rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+      rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
 
       # read existing dataset and select most useful features
       csv=feats[,c("SMILES", rfProfile$optVariables)]
@@ -528,7 +546,7 @@ module OpenTox
     # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
     # @return [Hash] Hash with matching Smarts and number of hits
     def self.lookup(params)
-      params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+      params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type])
     end
 
 end
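For orientation, a hedged sketch of how the slimmed-down lookup delegation above would be called after this commit. The compound, feature names and dataset URI are illustrative, and the exact module nesting of lookup is not visible in this hunk:

    # Usage sketch only -- URIs, feature names and the Algorithm namespace are assumptions
    params = {
      :compound => OpenTox::Compound.from_smiles("c1ccccc1"),
      :features => ["TPSA", "XLogP"],
      :feature_dataset_uri => "http://example.org/dataset/1",  # hypothetical
      :pc_type => "constitutional,electronic"                  # comma-separated pc types
    }
    entry = OpenTox::Algorithm.lookup(params)  # no :subjectid key needed after this change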
diff --git a/lib/authorization.rb b/lib/authorization.rb
index a9744e9..5d57781 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,15 +37,13 @@ module OpenTox
 
     #Loads and sends Policyfile(XML) to open-sso server
     # @param [String] URI to create a policy for
-    def send(uri)    
+    def send(uri)
       xml = get_xml(uri)
       ret = false
-      ret = Authorization.create_policy(xml, @subjectid)
-      LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
-      ret = Authorization.create_policy(xml, @subjectid) if !ret
+      ret = Authorization.create_policy(xml, @subjectid)
       LOGGER.debug "Policy send with subjectid: #{@subjectid}"
       LOGGER.warn "Not created Policy is: #{xml}" if !ret
-      ret  
+      ret
     end
 
   end
@@ -339,7 +337,7 @@ module OpenTox
     # @param [String] subjectid
     # @return [Boolean] true if access granted, else otherwise
     def self.authorized?(uri, request_method, subjectid)
-      if CONFIG[:authorization][:free_request].include?(request_method)  
+      if CONFIG[:authorization][:free_request].include?(request_method)
         #LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
         true
       elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -362,7 +360,7 @@ module OpenTox
         false
       end
     end
-    
+
     private
     def self.free_uri?(uri, request_method)
       if CONFIG[:authorization][:free_uris]
@@ -376,7 +374,7 @@ module OpenTox
       end
       return false
     end
-    
+
     def self.authorize_exception?(uri, request_method)
       if CONFIG[:authorization][:authorize_exceptions]
        CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -389,6 +387,6 @@ module OpenTox
       end
       return false
     end
-    
+
   end
-end
+end
\ No newline at end of file
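authorized? above consults three keys under CONFIG[:authorization]. A minimal sketch of their apparent shape, inferred from the iteration patterns in this hunk; the concrete values are illustrative, not a shipped default:

    # Hedged example of the configuration consulted by Authorization.authorized?
    CONFIG[:authorization] = {
      :free_request => ["GET"],                          # request methods that skip policy checks
      :free_uris => { ["GET"] => [/compound/] },         # method list => URI patterns (shape assumed)
      :authorize_exceptions => { ["POST"] => [/task/] }  # method list => URI patterns (shape assumed)
    }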
diff --git a/lib/compound.rb b/lib/compound.rb
index 8928081..6d3cb68 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
 module OpenTox
 
+  require "rexml/document"
 
   # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
   class Compound
@@ -130,6 +131,47 @@ module OpenTox
         "not available"
       end
     end
+
+
+    # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+    # @example
+    #   names = compound.to_names_hash
+    # @return [Hash] Classification => Name Array
+    def to_names_hash
+      begin
+        xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+        xmldoc = REXML::Document.new(xml)
+        data = {}
+
+        xmldoc.root.elements[1].elements.each{|e|
+          if data.has_key?(e.attribute("classification").value) == false
+            data[e.attribute("classification").value] = [e.text]
+          else
+            data[e.attribute("classification").value].push(e.text)
+          end
+        }
+        data
+      rescue
+        "not available"
+      end
+    end
+
+    # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+    # @example
+    #   names = compound.to_names_hash
+    # @return [Hash] Classification => Name Array
+    def to_ambit_names_hash
+      begin
+        ds = OpenTox::Dataset.new
+        ds.save
+        ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+        ds.save
+        ds.uri
+      rescue
+        "not available"
+      end
+    end
+
 
     # Match a smarts string
     # @example
@@ -197,6 +239,7 @@ module OpenTox
     # Lookup numerical values, returns hash with feature name as key and value as value
     # @param [Array] Array of feature names
     # @param [String] Feature dataset uri
+    # @param [String] Comma separated pc types
     # @return [Hash] Hash with feature name as key and value as value
     def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
       ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
@@ -211,11 +254,12 @@ module OpenTox
       LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
 
       if entry.nil?
-        uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
-        uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
-        ds = OpenTox::Dataset.find(uri,subjectid)
+        temp_ds = OpenTox::Dataset.create; temp_ds.add_compound(self.uri)
+        uri = RestClientWrapper.post(temp_ds.save + "/pcdesc", {:pc_type => pc_type})
+        ds = OpenTox::Dataset.find(uri)
         entry = ds.data_entries[self.uri]
-        ds.delete(subjectid)
+        ds.delete
+        temp_ds.delete
       end
       features = entry.keys
       features.each { |feature|
@@ -224,7 +268,6 @@ module OpenTox
         entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
       }
       #res = feature_array.collect {|v| entry[v]}
-      #LOGGER.debug "----- am #{entry.to_yaml}"
       entry
     end
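A short usage sketch for the two name helpers added above; the SMILES string is illustrative. Note that to_ambit_names_hash actually returns a dataset URI, although its @return tag still says Hash:

    # Usage sketch for the new name helpers (SMILES illustrative)
    compound = OpenTox::Compound.from_smiles("CN1C=NC2=C1C(=O)N(C)C(=O)N2C")
    names = compound.to_names_hash        # hash of classification => array of names
    uri   = compound.to_ambit_names_hash  # URI of a dataset holding Ambit name matches
    # both helpers fall back to the string "not available" on service errors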
diff --git a/lib/environment.rb b/lib/environment.rb
index c1b8312..6a72ba5 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
 OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
 OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
 XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
 
diff --git a/lib/model.rb b/lib/model.rb
index a858a0f..b3de1a3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -237,6 +237,7 @@ module OpenTox
 
       @compound = Compound.new compound_uri
       features = {}
+      #LOGGER.debug self.to_yaml
 
       unless @prediction_dataset
         @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +248,22 @@ module OpenTox
           OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
         } )
       end
+
       if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
         all_activities = []
         all_activities = @activities.values.flatten.collect! { |i| i.to_f }
       end
+
       unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
         # Calculation of needed values for query compound
         @compound_features = eval("#{@feature_calculation_algorithm}({
           :compound => @compound,
           :features => @features,
           :feature_dataset_uri => @metadata[OT.featureDataset],
-          :pc_type => self.parameter(\"pc_type\"),
-          :subjectid => subjectid
+          :pc_type => self.parameter(\"pc_type\")
         })")
+
         # Adding fingerprint of query compound with features and values(p_value*nr_hits)
         @compound_fingerprints = {}
         @compound_features.each do |feature, value| # value is nil if "Substructure.match"
diff --git a/lib/parser.rb b/lib/parser.rb
index 56e4fed..e871323 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -349,8 +349,11 @@ module OpenTox
 
     # Load CSV string (format specification: http://toxcreate.org/help)
     # @param [String] csv CSV representation of the dataset
+    # @param [Boolean] drop_missing Whether completely missing rows should be droppped
+    # @param [Boolean] all_numeric Whether all features should be treated as numeric
+    # @param [Boolean] del_nominal All nominal features will be removed
     # @return [OpenTox::Dataset] Dataset object with CSV data
-    def load_csv(csv, drop_missing=false)
+    def load_csv(csv, drop_missing=false, all_numeric=false)
       row = 0
       input = csv.split("\n")
       headers = split_row(input.shift)
@@ -362,7 +365,7 @@ module OpenTox
         row = split_row(row)
         value_maps = detect_new_values(row, value_maps)
         value_maps.each_with_index { |vm,j|
-          if vm.size > @max_class_values # max @max_class_values classes.
+          if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
             regression_features[j]=true
           else
             regression_features[j]=false
@@ -395,17 +398,14 @@ module OpenTox
       info = ''
       @feature_types.each do |feature,types|
         if types.uniq.size == 0
-          type = "helper#MissingFeature"
+          type = "helper#MissingFeature" # TODO: Fit to OT ontology!
         elsif types.uniq.size > 1
           type = OT.NumericFeature
         else
           type = types.first
         end
         @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-        info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
-        # TODO: rewrite feature values
-        # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
+        info += "'#{@dataset.feature_name(feature)}' detected as '#{type.split('#').last}'<br>" if type
       end
 
       @dataset.metadata[OT.Info] = info
@@ -522,7 +522,6 @@ module OpenTox
       def initialize
         @data = {}
         @activity_errors = []
-        @max_class_values = 3
       end
 
       def feature_values(feature)
@@ -654,7 +653,7 @@ module OpenTox
           obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
           table.data[compound.uri] = row
         end
-        
+
         # find and remove ignored_features
         @activity_errors = table.clean_features
         table.add_to_dataset @dataset
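The new all_numeric flag above forces every column to be typed numeric regardless of @max_class_values; pc_descriptors in lib/utils.rb (further down) relies on exactly this. Note that the @param list also documents a del_nominal flag which the new signature does not take yet. A hedged call sketch with invented CSV content:

    # Usage sketch for load_csv after this change (CSV content invented)
    csv = "SMILES,activity\nc1ccccc1,1.0\nCCO,0.5"
    parser = OpenTox::Parser::Spreadsheets.new
    parser.dataset = OpenTox::Dataset.new
    dataset = parser.load_csv(csv, false, true)  # drop_missing=false, all_numeric=true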
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 0d4e82c..7163c46 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -8,18 +8,6 @@ PACKAGE_DIR = package_dir
 
 require "tempfile"
 
-class Array
-
-  def check_uniq
-    hash = {}
-    self.each do |x|
-      raise "duplicate #{x}" if hash[x]
-      hash[x] = true
-    end
-  end
-
-end
-
 module OpenTox
 
   class RUtil
@@ -87,10 +75,12 @@ module OpenTox
     end
 
     # embedds feature values of two datasets into 2D and plots it
+    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
     #
     def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
-        features=nil, subjectid=nil, waiting_task=nil)
+        features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
 
+      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
       LOGGER.debug("r-util> create feature value plot")
       d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
       d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -112,13 +102,17 @@ module OpenTox
       @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
       @r.names = [dataset_name1, dataset_name2]
       LOGGER.debug("r-util> - convert data to 2d")
-      #@r.eval "save.image(\"/tmp/image.R\")"
-      @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
+      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
       waiting_task.progress(75) if waiting_task
 
+      if fast_plot
+        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+      else
+        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+      end
       LOGGER.debug("r-util> - plot data")
       plot_to_files(files) do |file|
-        @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
+        @r.eval "plot_split( df.2d, split, names, #{info})"
       end
     end
 
@@ -176,68 +170,19 @@ module OpenTox
       end
     end
 
-    # stratified splits a dataset into two dataset according to the feature values
-    # all features are taken into account unless <split_features> is given
-    # returns two datases
-    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
-      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
-    end
-
-    # stratified splits a dataset into k datasets according the feature values
+    # stratified splits a dataset into two dataset the feature values
     # all features are taken into account unless <split_features> is given
-    # returns two arrays of datasets
-    def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
-      stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
-    end
-
-    private
-    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
-      raise "internal error" if num_folds!=nil and pct!=nil
-      k_fold_split = num_folds!=nil
-      if k_fold_split
-        raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
-      else
-        raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
-      end
+    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
       raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
-      raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
-      raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
       LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
 
-      df = dataset_to_dataframe( dataset, missing_values, subjectid)
+      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
       @r.eval "set.seed(#{seed})"
-      str_split_features = ""
-      if split_features
-        @r.split_features = split_features if split_features
-        str_split_features = "colnames=split_features"
-      end
-      @r.eval "save.image(\"/tmp/image.R\")"
-
-      if k_fold_split
-        @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
-        split = @r.pull 'split'
-        train = []
-        test = []
-        num_folds.times do |f|
-          datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
-          metadata[DC.title] = "training "+datasetname
-          train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
-          metadata[DC.title] = "test "+datasetname
-          test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
-        end
-        return train, test
-      else
-        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
-        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
-        split = @r.pull 'split'
-        metadata[DC.title] = "Training dataset split of "+dataset.uri
-        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
-        metadata[DC.title] = "Test dataset split of "+dataset.uri
-        test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
-        return train, test
-      end
+      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+      split = @r.pull 'split'
+      split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+      split_to_datasets( df, split, subjectid )
     end
-    public
 
     # dataset should be loaded completely (use Dataset.find)
     # takes duplicates into account
@@ -267,13 +212,9 @@ module OpenTox
         features = dataset.features.keys.sort
       end
       compounds = []
-      compound_names = []
       dataset.compounds.each do |c|
-        count = 0
         num_compounds[c].times do |i|
           compounds << c
-          compound_names << "#{c}$#{count}"
-          count+=1
         end
       end
 
@@ -297,7 +238,7 @@ module OpenTox
         end
       end
       df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
-      assign_dataframe(df_name,d_values,compound_names,features)
+      assign_dataframe(df_name,d_values,compounds,features)
 
       # set dataframe column types accordingly
       f_count = 1 #R starts at 1
@@ -323,18 +264,16 @@ module OpenTox
 
     # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
     # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
-    def dataframe_to_dataset( df, metadata={}, subjectid=nil )
-      dataframe_to_dataset_indices( df, metadata, subjectid, nil)
+    def dataframe_to_dataset( df, subjectid=nil )
+      dataframe_to_dataset_indices( df, subjectid, nil)
     end
 
     private
-    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
+    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
       raise unless @@feats[df].size>0
-      values, compound_names, features = pull_dataframe(df)
-      compounds = compound_names.collect{|c| c.split("$")[0]}
+      values, compounds, features = pull_dataframe(df)
       features.each{|f| raise unless @@feats[df][f]}
       dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
-      dataset.add_metadata(metadata)
       LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
       compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
       features.each{|f| dataset.add_feature(f,@@feats[df][f])}
@@ -351,12 +290,16 @@ module OpenTox
       dataset
     end
 
-    def split_to_dataset( df, split, metadata={}, subjectid=nil )
-      indices = []
-      split.size.times{|i| indices<<i if yield(split[i]) }
-      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
-      LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
-      dataset
+    def split_to_datasets( df, split, subjectid=nil )
+      sets = []
+      (split.min.to_i .. split.max.to_i).each do |i|
+        indices = []
+        split.size.times{|j| indices<<j if split[j]==i}
+        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+        sets << dataset
+      end
+      sets
     end
 
     def pull_dataframe(df)
@@ -380,8 +323,6 @@ module OpenTox
     end
 
     def assign_dataframe(df,input,rownames,colnames)
-      rownames.check_uniq if rownames
-      colnames.check_uniq if colnames
       tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
       file = File.new(tmp, 'w')
       input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
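The rework above collapses the former two-way/k-fold split pair into a single stratified_split; the R mask is inverted so the selected pct partition becomes value 0 and is returned first by split_to_datasets. A hedged call sketch (dataset URI hypothetical):

    # Usage sketch for the simplified split API (URI hypothetical)
    rutil = OpenTox::RUtil.new
    dataset = OpenTox::Dataset.find("http://example.org/dataset/42")
    sets = rutil.stratified_split(dataset, "NA", 0.3)
    # sets[0]: the selected 30% partition, sets[1]: the remaining 70%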
"pca" : "smacof")}')" waiting_task.progress(75) if waiting_task + if fast_plot + info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'" + else + info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'" + end LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')" + @r.eval "plot_split( df.2d, split, names, #{info})" end end @@ -176,68 +170,19 @@ module OpenTox end end - # stratified splits a dataset into two dataset according to the feature values - # all features are taken into account unless <split_features> is given - # returns two datases - def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) - stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) - end - - # stratified splits a dataset into k datasets according the feature values + # stratified splits a dataset into two dataset the feature values # all features are taken into account unless <split_features> is given - # returns two arrays of datasets - def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil ) - stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features ) - end - - private - def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) - raise "internal error" if num_folds!=nil and pct!=nil - k_fold_split = num_folds!=nil - if k_fold_split - raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum) - else - raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) - end + def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 - raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 - raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") - df = dataset_to_dataframe( dataset, missing_values, subjectid) + df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features ) @r.eval "set.seed(#{seed})" - str_split_features = "" - if split_features - @r.split_features = split_features if split_features - str_split_features = "colnames=split_features" - end - @r.eval "save.image(\"/tmp/image.R\")" - - if k_fold_split - @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})" - split = @r.pull 'split' - train = [] - test = [] - num_folds.times do |f| - datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s - metadata[DC.title] = "training "+datasetname - train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) } - metadata[DC.title] = "test "+datasetname - test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) } - end - return train, test - else - puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})" - split = @r.pull 'split' - metadata[DC.title] = "Training dataset split of "+dataset.uri - train = 
diff --git a/lib/stratification.R b/lib/stratification.R
index 3f8698c..76ff2d8 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,13 +1,4 @@
 
-round_it <- function( x )
-{
-  if(isTRUE((x - floor(x))>=0.5))
-    ceiling(x)
-  else
-    floor(x)
-}
-
-
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -50,13 +41,9 @@ nominal_to_binary <- function( data )
   result
 }
 
-process_data <- function( data, colnames=NULL )
+process_data <- function( data )
 {
   data.num <- as.data.frame(data)
-  if (!is.null(colnames))
-  {
-    data.num = subset(data.num, select = colnames)
-  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -85,15 +72,14 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
+stratified_split <- function( data, ratio=0.3, method="cluster" )
 {
-  data.processed = as.matrix(process_data( data, colnames ))
-  print(paste("split using #features: ",ncol(data.processed)))
+  data.processed = as.matrix(process_data( data ))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
+    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -115,11 +101,10 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
     stop("unknown method")
 }
 
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data, colnames ))
-  print(paste("split using #features: ",ncol(data.processed)))
+  data.processed = as.matrix(process_data( data ))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -148,7 +133,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+    res = generateCVRuns(cl,ntimes=1,nfold=3)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -159,50 +144,6 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
     stop("unknown method")
 }
 
-duplicate_indices <- function( data ) {
-  indices = 1:nrow(data)
-  z = data
-  duplicate_index = anyDuplicated(z)
-  while(duplicate_index) {
-    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
-    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
-    indices[duplicate_index] <- duplicate_to_index
-    z[duplicate_index,] <- paste('123$§%',duplicate_index)
-    duplicate_index = anyDuplicated(z)
-  }
-  indices
-}
-
-add_duplicates <- function( data, dup_indices ) {
-  result = data[1,]
-  for(i in 2:length(dup_indices)) {
-    row = data[rownames(data)==dup_indices[i],]
-    if(length(row)==0)
-      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
-    result = rbind(result, row)
-  }
-  rownames(result)<-NULL
-  result
-}
-
-sammon_duplicates <- function( data, ... ) {
-  di <- duplicate_indices(data)
-  print(di)
-  u <- unique(data)
-  print(paste('unique data points',nrow(u),'of',nrow(data)))
-  if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
-  points_unique <- sammon(dist(u), ...)$points
-  if (nrow(u)<nrow(data))
-  {
-    points <- add_duplicates(points_unique, di)
-    points
-  }
-  else
-  {
-    points_unique
-  }
-}
-
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -217,11 +158,6 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
-  else if (method == "sammon")
-  {
-    require("MASS")
-    sammon_duplicates(data.processed, k=2)
-  }
   else
     stop("unknown method")
 }
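On the Ruby side, stratified_split from stratification.R is driven through RinRuby as shown in the lib/r-util.rb hunk above; in condensed form (the dataframe name is illustrative):

    # Condensed sketch of the Ruby/R handshake (dataframe name illustrative)
    @r.eval "set.seed(42)"
    @r.eval "split <- stratified_split(df_mydataset, ratio=0.3)"
    split = @r.pull 'split'   # one 0/1 marker per row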
diff --git a/lib/transform.rb b/lib/transform.rb
index f6f769d..cb530a3 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -396,8 +396,8 @@ module OpenTox
         @q_prop = gsl_q_prop_orig.row(0).to_a
       end
 
-      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
-      LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" if (@sims && @acts)
+      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
+      LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
       @sims = [ gram_matrix, @sims ]
 
diff --git a/lib/utils.rb b/lib/utils.rb
index d9d7b4b..40988db 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,4 +1,5 @@
 require 'csv'
+require 'tempfile'
 
 module OpenTox
 
@@ -8,18 +9,60 @@ module OpenTox
     include OpenTox
 
     # Calculate physico-chemical descriptors.
-    # @param[Hash] Required keys: :dataset_uri, :pc_type
+    # @param[Hash] Required keys: :dataset_uri, :pc_type, :rjb
     # @return[String] dataset uri
-
     def self.pc_descriptors(params)
 
       begin
         ds = OpenTox::Dataset.find(params[:dataset_uri])
         compounds = ds.compounds.collect
-        ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
-        #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
-        LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
-        load_ds_csv(ambit_result_uri, smiles_to_inchi)
+
+        jl_master=nil
+        ambit_master=nil
+
+        # joelib via rjb
+        types = params[:pc_type].split(",")
+
+        step= (1.0/types.size * 100).floor
+        if types.size && types.include?("joelib")
+          jl_master = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb] } )
+          types.delete("joelib")
+        end
+        params[:task].progress(step) if params[:task]
+
+
+        # ambit via REST
+        if types.size > 0
+          ambit_result_uri, smiles_to_inchi = get_ambit_descriptors( { :compounds => compounds, :pc_type => types.join(','), :task => params[:task], :step => step } )
+          LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+          ambit_master = load_ds_csv(ambit_result_uri, smiles_to_inchi)
+        end
+
+
+        # Fuse CSVs
+        if jl_master && ambit_master
+          nr_cols = (jl_master[0].size)-1
+          LOGGER.debug "Merging #{nr_cols} new columns"
+          ambit_master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+          jl_master.each do |row|
+            temp = ambit_master.assoc(row[0]) # Finds the appropriate line in master
+            ((-1*nr_cols)..-1).collect.each { |idx|
+              temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+            }
+          end
+          master = ambit_master
+        else
+          master = jl_master if jl_master
+          master = ambit_master if ambit_master
+        end
+
+        parser = OpenTox::Parser::Spreadsheets.new
+        ds = OpenTox::Dataset.new
+        ds.save
+        parser.dataset = ds
+        ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true)
+        ds.save
+
       rescue Exception => e
         LOGGER.debug "#{e.class}: #{e.message}"
         LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
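The fusion step above widens every Ambit row by the number of JOELib value columns, then matches rows by the key in column 0 and copies the values across; because assoc also matches the header rows, the header cells merge the same way. A self-contained toy illustration of the idiom (data invented):

    # Self-contained sketch of the row-fusion idiom (toy data)
    ambit_master = [["Compound", "TPSA"], ["InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3", "20.2"]]
    jl_master    = [["Compound", "KierShape1"], ["InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3", "1.23"]]
    nr_cols = jl_master[0].size - 1
    ambit_master.each { |row| nr_cols.times { row.push(nil) } }  # widen every row
    jl_master.each do |row|
      temp = ambit_master.assoc(row[0])                          # match on column 0
      ((-1 * nr_cols)..-1).each { |idx| temp[idx] = row[nr_cols + idx + 1] if temp }
    end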
,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing - LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" - load_ds_csv(ambit_result_uri, smiles_to_inchi) + + jl_master=nil + ambit_master=nil + + # joelib via rjb + types = params[:pc_type].split(",") + + step= (1.0/types.size * 100).floor + if types.size && types.include?("joelib") + jl_master = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb] } ) + types.delete("joelib") + end + params[:task].progress(step) if params[:task] + + + # ambit via REST + if types.size > 0 + ambit_result_uri, smiles_to_inchi = get_ambit_descriptors( { :compounds => compounds, :pc_type => types.join(','), :task => params[:task], :step => step } ) + LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" + ambit_master = load_ds_csv(ambit_result_uri, smiles_to_inchi) + end + + + # Fuse CSVs + if jl_master && ambit_master + nr_cols = (jl_master[0].size)-1 + LOGGER.debug "Merging #{nr_cols} new columns" + ambit_master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows + jl_master.each do |row| + temp = ambit_master.assoc(row[0]) # Finds the appropriate line in master + ((-1*nr_cols)..-1).collect.each { |idx| + temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found + } + end + master = ambit_master + else + master = jl_master if jl_master + master = ambit_master if ambit_master + end + + parser = OpenTox::Parser::Spreadsheets.new + ds = OpenTox::Dataset.new + ds.save + parser.dataset = ds + ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true) + ds.save + rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" @@ -27,10 +70,94 @@ module OpenTox end - # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit. + + # Calculates PC descriptors via JOELib2. + # @param[Hash] Required keys: :compounds, :rjb + # @return[String] dataset uri + def self.get_jl_descriptors(params) + + s = params[:rjb] + master = nil + raise "No Java environment" unless s + + # Load keys, enter CSV headers + begin + keysfile = File.join(ENV['HOME'], ".opentox", "config", "jl_keys.yaml") + csvfile = Tempfile.open(['jl_descriptors-csv-','.sdf']) + jl_keys = YAML::load_file(keysfile) + jl_colnames = jl_keys.collect{ |k| + k.split(".").last + } + csvfile.puts((["SMILES"] + jl_colnames).join(",")) + + # remember inchis + inchis = params[:compounds].collect { |c_uri| + cmpd = OpenTox::Compound.new(c_uri) + URI.encode_www_form_component(cmpd.to_inchi) + } + + # Process compounds + params[:compounds].each_with_index { |c_uri, c_idx| + cmpd = OpenTox::Compound.new(c_uri) + inchi = cmpd.to_inchi + sdf_data = cmpd.to_sdf + + infile = Tempfile.open(['jl_descriptors-in-','.sdf']) + outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out") + + begin + infile.puts sdf_data + infile.flush + s.new(infile.path, outfile_path) + + row = [inchis[c_idx]] + jl_keys.each_with_index do |k,i| # Fill row + re = Regexp.new(k) + open(outfile_path) do |f| + f.each do |line| + if @prev =~ re + entry = line.chomp + val = nil + if OpenTox::Algorithm.numeric?(entry) + val = Float(entry) + val = nil if val.nan? + val = nil if val.infinite? 
@@ -38,16 +165,17 @@ module OpenTox
         ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
         descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
         descs_uris = []
-        params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
         types = params[:pc_type].split(",")
         descs.each { |uri, cat_name|
           if types.include? cat_name[:category]
-            descs_uris << uri
+            descs_uris << "#{cat_name[:category]}:::#{uri}"
           end
         }
         if descs_uris.size == 0
           raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
         end
+        descs_uris.sort!
+        descs_uris.collect! { |uri| uri.split(":::").last }
         #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
         begin
@@ -75,17 +203,21 @@ module OpenTox
         end
         ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
 
-        # Calculate 3D for CPSA
-        if types.include? "cpsa"
-          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
-          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
-        end
+        # -C-a-l-c-u-l-a-t-e- -3-D- -f-o-r- -C-P-S-A-
+        # Always calculate 3D! See http://goo.gl/Tk81j
+        #if types.include? "cpsa"
+          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
+          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+        #end
 
         # Get Ambit results
         ambit_result_uri = [] # 1st pos: base uri, then features
         ambit_result_uri << ambit_ds_uri + "?"
         ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+        current_cat = ""
         descs_uris.each_with_index do |uri, i|
+          old_cat = current_cat; current_cat = descs[uri][:category]
+          params[:task].progress(params[:task].metadata[OT.percentageCompleted] + params[:step]) if params[:task] && params[:step] && old_cat != current_cat && old_cat != ""
           algorithm = Algorithm::Generic.new(uri)
           result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
           ambit_result_uri << result_uri.split("?")[1] + "&"
ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&") + current_cat = "" descs_uris.each_with_index do |uri, i| + old_cat = current_cat; current_cat = descs[uri][:category] + params[:task].progress(params[:task].metadata[OT.percentageCompleted] + params[:step]) if params[:task] && params[:step] && old_cat != current_cat && old_cat != "" algorithm = Algorithm::Generic.new(uri) result_uri = algorithm.run({:dataset_uri => ambit_ds_uri}) ambit_result_uri << result_uri.split("?")[1] + "&" @@ -104,13 +236,13 @@ module OpenTox # Load dataset via CSV # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features # @return[String] dataset uri - def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil) + def self.load_ds_csv(ambit_result_uri, smiles_to_inchi) master=nil (1...ambit_result_uri.size).collect { |idx| curr_uri = ambit_result_uri[0] + ambit_result_uri[idx] LOGGER.debug "Requesting #{curr_uri}" - csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) ) + csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) ) if csv_data[0] && csv_data[0].size>1 if master.nil? # This is the smiles entry (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] } @@ -139,17 +271,12 @@ module OpenTox master[0][0] = "Compound" #"SMILES" index_smi = master[0].index("SMILES") master.map {|i| i.delete_at(index_smi)} if index_smi - #master[0][0] = "SMILES" + master[0][0] = "SMILES" #LOGGER.debug "-------- AM: Writing to dumpfile" #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) } - parser = OpenTox::Parser::Spreadsheets.new - ds = OpenTox::Dataset.new(nil,subjectid) - ds.save(subjectid) - parser.dataset = ds - ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n")) - ds.save(subjectid) + master end |