 ChangeLog            |   8
 VERSION              |   2
 lib/algorithm.rb     |  30
 lib/authorization.rb |  18
 lib/compound.rb      |  53
 lib/environment.rb   |   2
 lib/model.rb         |   8
 lib/parser.rb        |  17
 lib/r-util.rb        | 123
 lib/serializer.rb    |  11
 lib/stratification.R |  78
 lib/transform.rb     |   4
 lib/utils.rb         | 175
 13 files changed, 299 insertions(+), 230 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 5872d56..de9e01b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,3 @@
-v3.1.0 2012-02-24
- * utils.rb: added for special routines (e.g. descriptor calculation)
- * task.rb: Polling with increasing interval
- * parser.rb: CSV up and download fixed
- * transform.rb: routines to create machine learning data matrices
- * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
- gauss() removed
-
v3.0.1 2011-10-19
* feature: model registration to ontology service
* ontology lib gets endpoints from ontology service
diff --git a/VERSION b/VERSION
index fd2a018..cb2b00e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.1.0
+3.0.1
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 9dcf6a8..ebd2019 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -477,10 +477,22 @@ module OpenTox
# assumes a data matrix 'features' and a vector 'y' of target values
row.names(features)=NULL
+ # features with all values missing removed
+ na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+ features = features[,!names(features) %in% na_col]
+
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
+ # features with zero variance removed
+ zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+ features = features[,!names(features) %in% zero_var]
+
pp = NULL
if (del_missing) {
# needed if rows should be removed
- na_ids = apply(features,1,function(x)any(is.na(x)))
+ na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
features = features[!na_ids,]
y = y[!na_ids]
pp = preProcess(features, method=c("scale", "center"))
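The three column filters added above matter because caret's preProcess() and rfe() fail on all-missing, infinite, or constant columns. A minimal sketch of the same filtering through RinRuby (toy data frame, not part of the patch):

    require "rinruby"
    r = RinRuby.new
    r.eval <<-EOR
      features = data.frame(a=c(1,2,3), b=c(NA,NA,NA), c=c(1,1,1), d=c(1,Inf,2), e=c(2,4,6))
      features = features[, !apply(features, 2, function(x) all(is.na(x)))]        # drops b
      features = features[, !apply(features, 2, function(x) any(is.infinite(x)))]  # drops d
      features = features[, !apply(features, 2, function(x) var(x, na.rm=T) == 0)] # drops c
      print(colnames(features))  # "a" "e"
    EOR
    r.quit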
@@ -490,15 +502,21 @@ module OpenTox
}
features = predict(pp, features)
+ # features with NaN values removed (sometimes preProcess returns NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
# determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
-
+
# Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
# read existing dataset and select most useful features
csv=feats[,c("SMILES", rfProfile$optVariables)]
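The replacement subset grid sweeps 0.30 to 0.70 of the column count in steps of 0.02 instead of the old coarse grid, and rfeControl's number (resampling iterations) rises from 50 to 150. The subset arithmetic in plain Ruby, with an invented column count (this grid never produces exact halves, so R's round-half-to-even cannot diverge from Ruby's rounding here):

    n_features = 40
    fractions = (15..35).map { |i| i * 0.02 }             # 0.30, 0.32, ..., 0.70
    subsets   = fractions.map { |f| (n_features * f).round }
    subsets   = subsets.uniq.sort.select { |s| s > 1 && s <= n_features }
    # => [12, 13, 14, ..., 28]  -- 17 candidate sizes for rfe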
@@ -528,7 +546,7 @@ module OpenTox
# @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
# @return [Hash] Hash with matching Smarts and number of hits
def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type])
end
end
diff --git a/lib/authorization.rb b/lib/authorization.rb
index a9744e9..5d57781 100644
--- a/lib/authorization.rb
+++ b/lib/authorization.rb
@@ -37,15 +37,13 @@ module OpenTox
#Loads and sends Policyfile(XML) to open-sso server
# @param [String] URI to create a policy for
- def send(uri)
+ def send(uri)
xml = get_xml(uri)
ret = false
- ret = Authorization.create_policy(xml, @subjectid)
- LOGGER.warn "Create policy on openSSO failed for URI: #{uri} subjectid: #{@subjectid}. Will try again." if !ret
- ret = Authorization.create_policy(xml, @subjectid) if !ret
+ ret = Authorization.create_policy(xml, @subjectid)
LOGGER.debug "Policy send with subjectid: #{@subjectid}"
LOGGER.warn "Not created Policy is: #{xml}" if !ret
- ret
+ ret
end
end
@@ -339,7 +337,7 @@ module OpenTox
# @param [String] subjectid
# @return [Boolean] true if access granted, false otherwise
def self.authorized?(uri, request_method, subjectid)
- if CONFIG[:authorization][:free_request].include?(request_method)
+ if CONFIG[:authorization][:free_request].include?(request_method)
#LOGGER.debug "authorized? >>true<< (request is free), method: #{request_method}, URI: #{uri}, subjectid: #{subjectid}"
true
elsif OpenTox::Authorization.free_uri?(uri, request_method)
@@ -362,7 +360,7 @@ module OpenTox
false
end
end
-
+
private
def self.free_uri?(uri, request_method)
if CONFIG[:authorization][:free_uris]
@@ -376,7 +374,7 @@ module OpenTox
end
return false
end
-
+
def self.authorize_exception?(uri, request_method)
if CONFIG[:authorization][:authorize_exceptions]
CONFIG[:authorization][:authorize_exceptions].each do |request_methods,uris|
@@ -389,6 +387,6 @@ module OpenTox
end
return false
end
-
+
end
-end
+end
\ No newline at end of file
diff --git a/lib/compound.rb b/lib/compound.rb
index 8928081..6d3cb68 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
module OpenTox
+ require "rexml/document"
# Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
class Compound
@@ -130,6 +131,47 @@ module OpenTox
"not available"
end
end
+
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Look up compound names via the Ambit structure search. Relies on an external service for name lookups.
+ # @example
+ #   dataset_uri = compound.to_ambit_names_hash
+ # @return [String] URI of a dataset containing the matching names
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+
# Match a smarts string
# @example
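For reference, a usage sketch for the new Cactus-backed lookup (the SMILES and the returned classification keys are illustrative; keys depend on the Cactus response):

    compound = OpenTox::Compound.from_smiles("c1ccccc1")
    names = compound.to_names_hash
    # e.g. {"IUPAC Name" => ["benzene"], "Synonym" => [...]}
    # or the string "not available" if the service call fails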
@@ -197,6 +239,7 @@ module OpenTox
# Lookup numerical values, returns hash with feature name as key and value as value
# @param [Array] Array of feature names
# @param [String] Feature dataset uri
+ # @param [String] Comma-separated pc types
# @return [Hash] Hash with feature name as key and value as value
def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
@@ -211,11 +254,12 @@ module OpenTox
LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
if entry.nil?
- uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
- uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
- ds = OpenTox::Dataset.find(uri,subjectid)
+ temp_ds = OpenTox::Dataset.create; temp_ds.add_compound(self.uri)
+ uri = RestClientWrapper.post(temp_ds.save + "/pcdesc", {:pc_type => pc_type})
+ ds = OpenTox::Dataset.find(uri)
entry = ds.data_entries[self.uri]
- ds.delete(subjectid)
+ ds.delete
+ temp_ds.delete
end
features = entry.keys
features.each { |feature|
@@ -224,7 +268,6 @@ module OpenTox
entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
}
#res = feature_array.collect {|v| entry[v]}
- #LOGGER.debug "----- am #{entry.to_yaml}"
entry
end
diff --git a/lib/environment.rb b/lib/environment.rb
index c1b8312..6a72ba5 100644
--- a/lib/environment.rb
+++ b/lib/environment.rb
@@ -91,5 +91,5 @@ DC = OwlNamespace.new 'http://purl.org/dc/elements/1.1/'
OT = OwlNamespace.new 'http://www.opentox.org/api/1.1#'
OTA = OwlNamespace.new 'http://www.opentox.org/algorithmTypes.owl#'
XSD = OwlNamespace.new 'http://www.w3.org/2001/XMLSchema#'
-#BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
+BO = OwlNamespace.new 'http://www.blueobelisk.org/ontologies/chemoinformatics-algorithms/#'
diff --git a/lib/model.rb b/lib/model.rb
index a858a0f..b3de1a3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -237,6 +237,7 @@ module OpenTox
@compound = Compound.new compound_uri
features = {}
+
#LOGGER.debug self.to_yaml
unless @prediction_dataset
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +248,22 @@ module OpenTox
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
} )
end
+
if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
all_activities = []
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
end
+
unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
# Calculation of needed values for query compound
@compound_features = eval("#{@feature_calculation_algorithm}({
:compound => @compound,
:features => @features,
:feature_dataset_uri => @metadata[OT.featureDataset],
- :pc_type => self.parameter(\"pc_type\"),
- :subjectid => subjectid
+ :pc_type => self.parameter(\"pc_type\")
})")
+
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
@compound_fingerprints = {}
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
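Since the eval in this hunk is easy to misread, this is the shape of the call it now builds, with the :subjectid key gone; the receiver depends on @feature_calculation_algorithm (shown here with a hypothetical name standing in for e.g. the self.lookup wrapper from lib/algorithm.rb above):

    @compound_features = SomeAlgorithm.lookup({   # hypothetical receiver
      :compound            => @compound,
      :features            => @features,
      :feature_dataset_uri => @metadata[OT.featureDataset],
      :pc_type             => self.parameter("pc_type")
    })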
diff --git a/lib/parser.rb b/lib/parser.rb
index 56e4fed..e871323 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -349,8 +349,11 @@ module OpenTox
# Load CSV string (format specification: http://toxcreate.org/help)
# @param [String] csv CSV representation of the dataset
+ # @param [Boolean] drop_missing Whether completely missing rows should be dropped
+ # @param [Boolean] all_numeric Whether all features should be treated as numeric
# @return [OpenTox::Dataset] Dataset object with CSV data
- def load_csv(csv, drop_missing=false)
+ def load_csv(csv, drop_missing=false, all_numeric=false)
row = 0
input = csv.split("\n")
headers = split_row(input.shift)
@@ -362,7 +365,7 @@ module OpenTox
row = split_row(row)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > @max_class_values # max @max_class_values classes.
+ if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
regression_features[j]=true
else
regression_features[j]=false
@@ -395,17 +398,14 @@ module OpenTox
info = ''
@feature_types.each do |feature,types|
if types.uniq.size == 0
- type = "helper#MissingFeature"
+ type = "helper#MissingFeature" # TODO: Fit to OT ontology!
elsif types.uniq.size > 1
type = OT.NumericFeature
else
type = types.first
end
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
- # TODO: rewrite feature values
- # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
+ info += "'#{@dataset.feature_name(feature)}' detected as '#{type.split('#').last}'<br>" if type
end
@dataset.metadata[OT.Info] = info
@@ -522,7 +522,6 @@ module OpenTox
def initialize
@data = {}
@activity_errors = []
- @max_class_values = 3
end
def feature_values(feature)
@@ -654,7 +653,7 @@ module OpenTox
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
table.data[compound.uri] = row
end
-
+
# find and remove ignored_features
@activity_errors = table.clean_features
table.add_to_dataset @dataset
diff --git a/lib/r-util.rb b/lib/r-util.rb
index 0d4e82c..7163c46 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -8,18 +8,6 @@ PACKAGE_DIR = package_dir
require "tempfile"
-class Array
-
- def check_uniq
- hash = {}
- self.each do |x|
- raise "duplicate #{x}" if hash[x]
- hash[x] = true
- end
- end
-
-end
-
module OpenTox
class RUtil
@@ -87,10 +75,12 @@ module OpenTox
end
# embeds feature values of two datasets into 2D and plots them
+ # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
#
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
- features=nil, subjectid=nil, waiting_task=nil)
+ features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+ raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
LOGGER.debug("r-util> create feature value plot")
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -112,13 +102,17 @@ module OpenTox
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
@r.names = [dataset_name1, dataset_name2]
LOGGER.debug("r-util> - convert data to 2d")
- #@r.eval "save.image(\"/tmp/image.R\")"
- @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
+ @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
waiting_task.progress(75) if waiting_task
+ if fast_plot
+ info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+ else
+ info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+ end
LOGGER.debug("r-util> - plot data")
plot_to_files(files) do |file|
- @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
+ @r.eval "plot_split( df.2d, split, names, #{info})"
end
end
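Call-site sketch for the new fast_plot switch (dataset URIs and file name invented; the smacof package is only required for the slow path):

    rutil = OpenTox::RUtil.new
    rutil.feature_value_plot(["/tmp/embedding.png"], ds1_uri, ds2_uri,
                             "training", "test", nil, true)  # true -> PCA, false -> SMACOF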
@@ -176,68 +170,19 @@ module OpenTox
end
end
- # stratified splits a dataset into two dataset according to the feature values
- # all features are taken into account unless <split_features> is given
- # returns two datases
- def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
- stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
- end
-
- # stratified splits a dataset into k datasets according the feature values
+ # stratified splits a dataset into two datasets according to the feature values
# all features are taken into account unless <split_features> is given
- # returns two arrays of datasets
- def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
- stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
- end
-
- private
- def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
- raise "internal error" if num_folds!=nil and pct!=nil
- k_fold_split = num_folds!=nil
- if k_fold_split
- raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
- else
- raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
- end
+ def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
- raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
- raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
- df = dataset_to_dataframe( dataset, missing_values, subjectid)
+ df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
@r.eval "set.seed(#{seed})"
- str_split_features = ""
- if split_features
- @r.split_features = split_features if split_features
- str_split_features = "colnames=split_features"
- end
- @r.eval "save.image(\"/tmp/image.R\")"
-
- if k_fold_split
- @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
- split = @r.pull 'split'
- train = []
- test = []
- num_folds.times do |f|
- datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
- metadata[DC.title] = "training "+datasetname
- train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
- metadata[DC.title] = "test "+datasetname
- test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
- end
- return train, test
- else
- puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
- @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
- split = @r.pull 'split'
- metadata[DC.title] = "Training dataset split of "+dataset.uri
- train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
- metadata[DC.title] = "Test dataset split of "+dataset.uri
- test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
- return train, test
- end
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+ split = @r.pull 'split'
+ split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+ split_to_datasets( df, split, subjectid )
end
- public
# dataset should be loaded completely (use Dataset.find)
# takes duplicates into account
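The collect flip a few lines up is worth spelling out: stratified_split() in R marks the selected fraction with 1, but split_to_datasets() emits datasets in ascending label order, so the selected rows are relabelled 0 to come out first. In isolation (values invented):

    split = @r.pull 'split'                   # e.g. [1, 0, 0, 1, 0] -- 1 = selected in R
    split = split.collect { |s| 1 - s.to_i }  # => [0, 1, 1, 0, 1]
    # split_to_datasets then builds one dataset per label, 0 before 1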
@@ -267,13 +212,9 @@ module OpenTox
features = dataset.features.keys.sort
end
compounds = []
- compound_names = []
dataset.compounds.each do |c|
- count = 0
num_compounds[c].times do |i|
compounds << c
- compound_names << "#{c}$#{count}"
- count+=1
end
end
@@ -297,7 +238,7 @@ module OpenTox
end
end
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
- assign_dataframe(df_name,d_values,compound_names,features)
+ assign_dataframe(df_name,d_values,compounds,features)
# set dataframe column types accordingly
f_count = 1 #R starts at 1
@@ -323,18 +264,16 @@ module OpenTox
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
- def dataframe_to_dataset( df, metadata={}, subjectid=nil )
- dataframe_to_dataset_indices( df, metadata, subjectid, nil)
+ def dataframe_to_dataset( df, subjectid=nil )
+ dataframe_to_dataset_indices( df, subjectid, nil)
end
private
- def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
+ def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
raise unless @@feats[df].size>0
- values, compound_names, features = pull_dataframe(df)
- compounds = compound_names.collect{|c| c.split("$")[0]}
+ values, compounds, features = pull_dataframe(df)
features.each{|f| raise unless @@feats[df][f]}
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
- dataset.add_metadata(metadata)
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
@@ -351,12 +290,16 @@ module OpenTox
dataset
end
- def split_to_dataset( df, split, metadata={}, subjectid=nil )
- indices = []
- split.size.times{|i| indices<<i if yield(split[i]) }
- dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
- LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
- dataset
+ def split_to_datasets( df, split, subjectid=nil )
+ sets = []
+ (split.min.to_i .. split.max.to_i).each do |i|
+ indices = []
+ split.size.times{|j| indices<<j if split[j]==i}
+ dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+ sets << dataset
+ end
+ sets
end
def pull_dataframe(df)
@@ -380,8 +323,6 @@ module OpenTox
end
def assign_dataframe(df,input,rownames,colnames)
- rownames.check_uniq if rownames
- colnames.check_uniq if colnames
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
file = File.new(tmp, 'w')
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 30cb2ba..2205ade 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -460,6 +460,17 @@ module OpenTox
@rows = []
@rows << ["SMILES"]
features = dataset.features.keys
+
+ delete_features = []
+ features.each{ |fn|
+ dataset.features[fn][RDF.type].each { |typestr|
+ if typestr.include? "MissingFeature"
+ delete_features << fn
+ end
+ }
+ }
+ features = features - delete_features
+
@rows.first << features
@rows.first.flatten!
dataset.data_entries.each do |compound,entries|
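The filter above keeps MissingFeature-typed columns out of the CSV header. Reduced to its core (feature URIs invented):

    features = ["http://host/feature/f1", "http://host/feature/f2"]
    types = { "http://host/feature/f1" => ["http://www.opentox.org/api/1.1#NumericFeature"],
              "http://host/feature/f2" => ["helper#MissingFeature"] }
    delete_features = features.select { |fn| types[fn].any? { |t| t.include?("MissingFeature") } }
    features -= delete_features   # => ["http://host/feature/f1"]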
diff --git a/lib/stratification.R b/lib/stratification.R
index 3f8698c..76ff2d8 100644
--- a/lib/stratification.R
+++ b/lib/stratification.R
@@ -1,13 +1,4 @@
-round_it <- function( x )
-{
- if(isTRUE((x - floor(x))>=0.5))
- ceiling(x)
- else
- floor(x)
-}
-
-
nominal_to_binary <- function( data )
{
result = NULL
@@ -50,13 +41,9 @@ nominal_to_binary <- function( data )
result
}
-process_data <- function( data, colnames=NULL )
+process_data <- function( data )
{
data.num <- as.data.frame(data)
- if (!is.null(colnames))
- {
- data.num = subset(data.num, select = colnames)
- }
if (!is.numeric(data.num))
{
data.num = nominal_to_binary(data.num)
@@ -85,15 +72,14 @@ cluster <- function( data, min=10, max=15 )
cbind(s$partition[,m])
}
-stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
+stratified_split <- function( data, ratio=0.3, method="cluster" )
{
- data.processed = as.matrix(process_data( data, colnames ))
- print(paste("split using #features: ",ncol(data.processed)))
+ data.processed = as.matrix(process_data( data ))
if (method == "samplecube")
{
require("sampling")
# adjust ratio to make samplecube return exact number of samples
- ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
+ ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
pik = rep(ratio,times=nrow(data.processed))
data.strat = cbind(pik,data.processed)
samplecube(data.strat,pik,order=2,comment=F)
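Swapping round_it() for base round() is not a pure cleanup: R's round() rounds half to even, while the removed helper always rounded .5 up, so samplecube's adjusted ratio can differ by one sample on exact halves. A quick check through RinRuby (sketch):

    require "rinruby"
    r = RinRuby.new
    r.eval "print(c(round(0.5), round(1.5), round(2.5)))"  # 0 2 2 -- not 1 2 3
    r.quit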
@@ -115,11 +101,10 @@ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
stop("unknown method")
}
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
{
print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data, colnames ))
- print(paste("split using #features: ",ncol(data.processed)))
+ data.processed = as.matrix(process_data( data ))
if (method == "samplecube")
{
folds = rep(0, times=nrow(data))
@@ -148,7 +133,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
{
require("TunePareto")
cl = cluster(data.processed)
- res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
+ res = generateCVRuns(cl,ntimes=1,nfold=3)
folds = rep(0, times=nrow(data))
for (i in 1:num_folds)
for(j in 1:length(res[[1]][[i]]))
@@ -159,50 +144,6 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colna
stop("unknown method")
}
-duplicate_indices <- function( data ) {
- indices = 1:nrow(data)
- z = data
- duplicate_index = anyDuplicated(z)
- while(duplicate_index) {
- duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
- #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
- indices[duplicate_index] <- duplicate_to_index
- z[duplicate_index,] <- paste('123$§%',duplicate_index)
- duplicate_index = anyDuplicated(z)
- }
- indices
-}
-
-add_duplicates <- function( data, dup_indices ) {
- result = data[1,]
- for(i in 2:length(dup_indices)) {
- row = data[rownames(data)==dup_indices[i],]
- if(length(row)==0)
- stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
- result = rbind(result, row)
- }
- rownames(result)<-NULL
- result
-}
-
-sammon_duplicates <- function( data, ... ) {
- di <- duplicate_indices(data)
- print(di)
- u <- unique(data)
- print(paste('unique data points',nrow(u),'of',nrow(data)))
- if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
- points_unique <- sammon(dist(u), ...)$points
- if (nrow(u)<nrow(data))
- {
- points <- add_duplicates(points_unique, di)
- points
- }
- else
- {
- points_unique
- }
-}
-
plot_pre_process <- function( data, method="pca" )
{
data.processed = process_data( data )
@@ -217,11 +158,6 @@ plot_pre_process <- function( data, method="pca" )
data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
data.emb$conf
}
- else if (method == "sammon")
- {
- require("MASS")
- sammon_duplicates(data.processed, k=2)
- }
else
stop("unknown method")
}
diff --git a/lib/transform.rb b/lib/transform.rb
index f6f769d..cb530a3 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -396,8 +396,8 @@ module OpenTox
@q_prop = gsl_q_prop_orig.row(0).to_a
end
- LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
- LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" if (@sims && @acts)
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
+ LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
@sims = [ gram_matrix, @sims ]
diff --git a/lib/utils.rb b/lib/utils.rb
index d9d7b4b..40988db 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,4 +1,5 @@
require 'csv'
+require 'tempfile'
module OpenTox
@@ -8,18 +9,60 @@ module OpenTox
include OpenTox
# Calculate physico-chemical descriptors.
- # @param[Hash] Required keys: :dataset_uri, :pc_type
+ # @param[Hash] Required keys: :dataset_uri, :pc_type, :rjb
# @return[String] dataset uri
-
def self.pc_descriptors(params)
begin
ds = OpenTox::Dataset.find(params[:dataset_uri])
compounds = ds.compounds.collect
- ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
- #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
- LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
- load_ds_csv(ambit_result_uri, smiles_to_inchi)
+
+ jl_master=nil
+ ambit_master=nil
+
+ # joelib via rjb
+ types = params[:pc_type].split(",")
+
+ step= (1.0/types.size * 100).floor
+ if types.size && types.include?("joelib")
+ jl_master = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb] } )
+ types.delete("joelib")
+ end
+ params[:task].progress(step) if params[:task]
+
+
+ # ambit via REST
+ if types.size > 0
+ ambit_result_uri, smiles_to_inchi = get_ambit_descriptors( { :compounds => compounds, :pc_type => types.join(','), :task => params[:task], :step => step } )
+ LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+ ambit_master = load_ds_csv(ambit_result_uri, smiles_to_inchi)
+ end
+
+
+ # Fuse CSVs
+ if jl_master && ambit_master
+ nr_cols = (jl_master[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ ambit_master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ jl_master.each do |row|
+ temp = ambit_master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ master = ambit_master
+ else
+ master = jl_master if jl_master
+ master = ambit_master if ambit_master
+ end
+
+ parser = OpenTox::Parser::Spreadsheets.new
+ ds = OpenTox::Dataset.new
+ ds.save
+ parser.dataset = ds
+ ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"),false,true)
+ ds.save
+
rescue Exception => e
LOGGER.debug "#{e.class}: #{e.message}"
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
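The CSV fusion above merges the JOELib columns into the Ambit table by the InChI in column 0, using Array#assoc to find the first matching row. A toy run (two rows, values invented):

    ambit  = [["SMILES", "a1"], ["InChI=1S/CH4/h1H4", "0.5"]]
    joelib = [["SMILES", "j1"], ["InChI=1S/CH4/h1H4", "7.1"]]
    nr_cols = joelib[0].size - 1                      # 1 new column
    ambit.each { |row| nr_cols.times { row.push(nil) } }
    joelib.each do |row|
      temp = ambit.assoc(row[0])                      # matching master row, if any
      ((-1 * nr_cols)..-1).each { |idx| temp[idx] = row[nr_cols + idx + 1] if temp }
    end
    # ambit => [["SMILES", "a1", "j1"], ["InChI=1S/CH4/h1H4", "0.5", "7.1"]]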
@@ -27,10 +70,94 @@ module OpenTox
end
- # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+
+ # Calculates PC descriptors via JOELib2.
+ # @param[Hash] Required keys: :compounds, :rjb
+ # @return[Array] CSV rows (InChI plus descriptor values)
+ def self.get_jl_descriptors(params)
+
+ s = params[:rjb]
+ master = nil
+ raise "No Java environment" unless s
+
+ # Load keys, enter CSV headers
+ begin
+ keysfile = File.join(ENV['HOME'], ".opentox", "config", "jl_keys.yaml")
+ csvfile = Tempfile.open(['jl_descriptors-csv-','.sdf'])
+ jl_keys = YAML::load_file(keysfile)
+ jl_colnames = jl_keys.collect{ |k|
+ k.split(".").last
+ }
+ csvfile.puts((["SMILES"] + jl_colnames).join(","))
+
+ # remember inchis
+ inchis = params[:compounds].collect { |c_uri|
+ cmpd = OpenTox::Compound.new(c_uri)
+ URI.encode_www_form_component(cmpd.to_inchi)
+ }
+
+ # Process compounds
+ params[:compounds].each_with_index { |c_uri, c_idx|
+ cmpd = OpenTox::Compound.new(c_uri)
+ inchi = cmpd.to_inchi
+ sdf_data = cmpd.to_sdf
+
+ infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+ outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+ begin
+ infile.puts sdf_data
+ infile.flush
+ s.new(infile.path, outfile_path)
+
+ row = [inchis[c_idx]]
+ jl_keys.each_with_index do |k,i| # Fill row
+ re = Regexp.new(k)
+ open(outfile_path) do |f|
+ f.each do |line|
+ if @prev =~ re
+ entry = line.chomp
+ val = nil
+ if OpenTox::Algorithm.numeric?(entry)
+ val = Float(entry)
+ val = nil if val.nan?
+ val = nil if val.infinite?
+ end
+ row << val
+ end
+ @prev = line
+ end
+ end
+ end
+ csvfile.puts(row.join(","))
+ csvfile.flush
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+ File.delete(outfile_path)
+ infile.close!
+ end
+ }
+ master = CSV::parse(File.open(csvfile.path, "rb").read)
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ [ csvfile].each { |f| f.close! }
+ end
+
+ master
+ end
+
+
+ # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
# @param[Hash] Required keys: :compounds, :pc_type
# @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
- def self.get_pc_descriptors(params)
+ def self.get_ambit_descriptors(params)
begin
@@ -38,16 +165,17 @@ module OpenTox
ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
descs_uris = []
- params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
types = params[:pc_type].split(",")
descs.each { |uri, cat_name|
if types.include? cat_name[:category]
- descs_uris << uri
+ descs_uris << "#{cat_name[:category]}:::#{uri}"
end
}
if descs_uris.size == 0
raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
end
+ descs_uris.sort!
+ descs_uris.collect! { |uri| uri.split(":::").last }
#LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
begin
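The ":::" round-trip above exists only to sort the descriptor URIs by category, so the per-category progress reporting further down fires once per group, before the tag is stripped again. In miniature (categories and URIs invented):

    descs_uris = ["electronic:::http://ambit/alg/2", "cpsa:::http://ambit/alg/7"]
    descs_uris.sort!                                   # groups by category prefix
    descs_uris.collect! { |uri| uri.split(":::").last }
    # => ["http://ambit/alg/7", "http://ambit/alg/2"]  -- cpsa block now precedes electronic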
@@ -75,17 +203,21 @@ module OpenTox
end
ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
- # Calculate 3D for CPSA
- if types.include? "cpsa"
- ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
- LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
- end
+ # -C-a-l-c-u-l-a-t-e- -3-D- -f-o-r- -C-P-S-A-
+ # Always calculate 3D! See http://goo.gl/Tk81j
+ #if types.include? "cpsa"
+ ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
+ LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
+ #end
# Get Ambit results
ambit_result_uri = [] # 1st pos: base uri, then features
ambit_result_uri << ambit_ds_uri + "?"
ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
+ current_cat = ""
descs_uris.each_with_index do |uri, i|
+ old_cat = current_cat; current_cat = descs[uri][:category]
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + params[:step]) if params[:task] && params[:step] && old_cat != current_cat && old_cat != ""
algorithm = Algorithm::Generic.new(uri)
result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
ambit_result_uri << result_uri.split("?")[1] + "&"
@@ -104,13 +236,13 @@ module OpenTox
# Load dataset via CSV
# @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
# @return[String] dataset uri
- def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi)
master=nil
(1...ambit_result_uri.size).collect { |idx|
curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
LOGGER.debug "Requesting #{curr_uri}"
- csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv"}) )
if csv_data[0] && csv_data[0].size>1
if master.nil? # This is the smiles entry
(1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
@@ -139,17 +271,12 @@ module OpenTox
master[0][0] = "Compound" #"SMILES"
index_smi = master[0].index("SMILES")
master.map {|i| i.delete_at(index_smi)} if index_smi
- #master[0][0] = "SMILES"
+ master[0][0] = "SMILES"
#LOGGER.debug "-------- AM: Writing to dumpfile"
#File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
- parser = OpenTox::Parser::Spreadsheets.new
- ds = OpenTox::Dataset.new(nil,subjectid)
- ds.save(subjectid)
- parser.dataset = ds
- ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
- ds.save(subjectid)
+ master
end