summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2012-04-23 11:17:58 +0200
committer mguetlein <martin.guetlein@gmail.com> 2012-04-23 11:17:58 +0200
commit 3b2f2033aa1d0936009bf13bc32ef6938834efb6 (patch)
tree ba52da0dc26e28d58cb9500c33cc3818eceaca47
parent 6927ecb8780ab2b969ecde71972e6c3c40e9d16f (diff)
parent 49e31b25079b82d6169bfadab734344b6bca9be0 (diff)
Merge branch 'development' of github.com:opentox/opentox-ruby into development
-rw-r--r-- ChangeLog 23
-rw-r--r-- VERSION 2
-rw-r--r-- lib/algorithm.rb 47
-rw-r--r-- lib/compound.rb 62
-rw-r--r-- lib/dataset.rb 41
-rw-r--r-- lib/model.rb 49
-rw-r--r-- lib/parser.rb 52
-rw-r--r-- lib/serializer.rb 73
-rw-r--r-- lib/transform.rb 10
-rw-r--r-- lib/utils.rb 449
10 files changed, 623 insertions, 185 deletions
diff --git a/ChangeLog b/ChangeLog
index 5872d56..8ed5b85 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,13 +1,18 @@
+2012-04-20
+ * Support for joelib and openbabel descriptors in a completely unified interface with CDK (Ambit)
+ * Features can have multiple types (nominal and numeric), PC descriptors have detailed meta data
+ * Myriads of bugfixes to CSV download code (e.g. missing descriptors, handling of duplicates)
+
v3.1.0 2012-02-24
- * utils.rb: added for special routines (e.g. descriptor calculation)
- * task.rb: Polling with increasing interval
- * parser.rb: CSV up and download fixed
- * transform.rb: routines to create machine learning data matrices
- * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
- gauss() removed
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm, gauss() removed
v3.0.1 2011-10-19
- * feature: model registration to ontology service
- * ontology lib gets endpoints from ontology service
+ * feature: model registration to ontology service
+ * ontology lib gets endpoints from ontology service
+
v3.0.0 2011-09-23
- * datasets stored as json (with Yajl) to improve performance
+ * datasets stored as json (with Yajl) to improve performance
\ No newline at end of file
diff --git a/VERSION b/VERSION
index cb2b00e..fd2a018 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.1
+3.1.0
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 9dcf6a8..b921b9c 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -56,9 +56,15 @@ module OpenTox
def check_params(params,per_mil,subjectid=nil)
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
@training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+ unless params[:prediction_feature] # try to read prediction_feature from dataset
+ raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+ prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+ params[:prediction_feature] = prediction_feature.uri
+ end
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
unless params[:min_frequency].nil?
@@ -69,10 +75,10 @@ module OpenTox
end
end
- def add_fminer_data(fminer_instance, params, value_map)
+ def add_fminer_data(fminer_instance, value_map)
id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry|
+ @training_dataset.data_entries.each do |compound,entry| #order of compounds does not influence result
begin
smiles = OpenTox::Compound.smiles(compound.to_s)
rescue
@@ -84,7 +90,6 @@ module OpenTox
next
end
- value_map=params[:value_map] unless params[:value_map].nil?
entry.each do |feature,values|
if feature == @prediction_feature.uri
values.each do |value|
@@ -92,7 +97,7 @@ module OpenTox
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
+ activity= value_map.invert[value].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
activity= value.to_f
@@ -477,10 +482,22 @@ module OpenTox
# assumes a data matrix 'features' and a vector 'y' of target values
row.names(features)=NULL
+ # features with all values missing removed
+ na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+ features = features[,!names(features) %in% na_col]
+
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
+ # features with zero variance removed
+ zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+ features = features[,!names(features) %in% zero_var]
+
pp = NULL
if (del_missing) {
# needed if rows should be removed
- na_ids = apply(features,1,function(x)any(is.na(x)))
+ na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
features = features[!na_ids,]
y = y[!na_ids]
pp = preProcess(features, method=c("scale", "center"))
@@ -490,15 +507,21 @@ module OpenTox
}
features = predict(pp, features)
+ # features with nan values removed (sometimes preProcess return NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
# determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
subsets = unique(sort(round(subsets)))
subsets = subsets[subsets<=dim(features)[2]]
subsets = subsets[subsets>1]
-
+
# Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
# read existing dataset and select most useful features
csv=feats[,c("SMILES", rfProfile$optVariables)]
@@ -528,7 +551,7 @@ module OpenTox
# @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
# @return [Hash] Hash with matching Smarts and number of hits
def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type], params[:lib],params[:subjectid])
end
end
diff --git a/lib/compound.rb b/lib/compound.rb
index c25125b..a08d541 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
module OpenTox
+ require "rexml/document"
# Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
class Compound
@@ -130,6 +131,47 @@ module OpenTox
"not available"
end
end
+
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Look up names for this compound via the Ambit name search service and store the result as a new dataset.
+ # @example
+ # uri = compound.to_ambit_names_hash
+ # @return [String] URI of the saved dataset holding the search result, or "not available" on error
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+
# Match a smarts string
# @example
@@ -197,25 +239,28 @@ module OpenTox
# Lookup numerical values, returns hash with feature name as key and value as value
# @param [Array] Array of feature names
# @param [String] Feature dataset uri
+ # @param [String] Comma separated pc types
+ # @param [String] Comma separated lib
# @return [Hash] Hash with feature name as key and value as value
- def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ def lookup(feature_array,feature_dataset_uri,pc_type,lib,subjectid=nil)
ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
#entry = ds.data_entries[self.uri]
entry = nil
- ds.data_entries.each { |c_uri, values|
- if c_uri.split('/compound/').last == self.to_inchi
- entry = ds.data_entries[self.uri]
+ ds.data_entries.each { |c_uri, values|
+ compound = OpenTox::Compound.new(c_uri)
+ if compound.to_inchi == self.to_inchi # Compare compounds by InChI
+ entry = ds.data_entries[c_uri]
break
end
}
LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
-
if entry.nil?
- uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
- uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
- ds = OpenTox::Dataset.find(uri,subjectid)
+ temp_ds = OpenTox::Dataset.create; temp_ds.add_compound(self.uri); temp_uri = temp_ds.save
+ uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib})
+ ds = OpenTox::Dataset.find(uri)
entry = ds.data_entries[self.uri]
ds.delete(subjectid)
+ temp_ds.delete
end
features = entry.keys
features.each { |feature|
@@ -224,7 +269,6 @@ module OpenTox
entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
}
#res = feature_array.collect {|v| entry[v]}
- #LOGGER.debug "----- am #{entry.to_yaml}"
entry
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 95c1918..c916722 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -197,7 +197,12 @@ module OpenTox
accept_values
end
- # Detect feature type(s) in the dataset
+ # Detect feature type (reduced to one across all features)
+ # Classification takes precedence over regression
+ # DEPRECATED --
+ # HAS NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
+ # FEATURES CAN HAVE MULTIPLE TYPES
+ # Replacement: see feature_types()
# @return [String] `classification", "regression", "mixed" or unknown`
def feature_type(subjectid=nil)
load_features(subjectid)
@@ -210,6 +215,24 @@ module OpenTox
"unknown"
end
end
+
+
+ # Detect feature types. A feature can have multiple types.
+ # Returns types hashed by feature URI, with missing features omitted.
+ # Example (YAML):
+ # http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
+ # - http://www.opentox.org/api/1.1#NumericFeature
+ # - http://www.opentox.org/api/1.1#NominalFeature
+ # ...
+ #
+ # @return [Hash] Keys: feature URIs, Values: Array of types
+ def feature_types(subjectid=nil)
+ load_features(subjectid)
+ @features.inject({}){ |h,(f,metadata)|
+ h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
+ h
+ }
+ end
=begin
=end
@@ -316,11 +339,14 @@ module OpenTox
end
# Complete feature values by adding zeroes
- def complete_data_entries
+ # @param [Hash] key: compound, value: duplicate sizes
+ def complete_data_entries(compound_sizes)
all_features = @features.keys
@data_entries.each { |c, e|
(Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
- self.add(c,f,0)
+ compound_sizes[c].times {
+ self.add(c,f,0)
+ }
}
}
end
@@ -454,6 +480,14 @@ module OpenTox
end
end
+ def value_map(prediction_feature_uri)
+ training_classes = accept_values(prediction_feature_uri).sort
+ value_map=Hash.new
+ training_classes.each_with_index { |c,i| value_map[i+1] = c }
+ value_map
+ end
+
+
private
# Copy a dataset (rewrites URI)
def copy(dataset)
@@ -504,6 +538,7 @@ module OpenTox
@data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
end
+
# def errors(compound)
# features = @data_entries[compound.uri].keys
# features.collect{|f| @features[f][OT.error]}.join(" ") if features
diff --git a/lib/model.rb b/lib/model.rb
index a858a0f..c9d367e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -103,7 +103,7 @@ module OpenTox
include Model
- attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
+ attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
def initialize(uri=nil)
if uri
@@ -169,12 +169,13 @@ module OpenTox
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
lazar.value_map = hash["value_map"] if hash["value_map"]
+ lazar.compounds = hash["compounds"] if hash["compounds"]
lazar
end
def to_json
- Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
+ Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
end
def run( params, accept_header=nil, waiting_task=nil )
@@ -237,6 +238,7 @@ module OpenTox
@compound = Compound.new compound_uri
features = {}
+
#LOGGER.debug self.to_yaml
unless @prediction_dataset
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +249,19 @@ module OpenTox
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
} )
end
- if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
- all_activities = []
- all_activities = @activities.values.flatten.collect! { |i| i.to_f }
- end
+
unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
# Calculation of needed values for query compound
@compound_features = eval("#{@feature_calculation_algorithm}({
:compound => @compound,
:features => @features,
:feature_dataset_uri => @metadata[OT.featureDataset],
:pc_type => self.parameter(\"pc_type\"),
+ :lib => self.parameter(\"lib\"),
:subjectid => subjectid
})")
+
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
@compound_fingerprints = {}
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -314,6 +316,16 @@ module OpenTox
@prediction_dataset.add @compound.uri, feature_uri, true
f+=1
end
+ elsif @feature_calculation_algorithm == "Substructure.lookup"
+ f = 0
+ @compound_features.each do |feature, value|
+ features[feature] = feature
+ @prediction_dataset.add_feature(feature, {
+ RDF.type => [OT.NumericFeature]
+ })
+ @prediction_dataset.add @compound.uri, feature, value
+ f+=1
+ end
else
@compound_features.each do |feature|
features[feature] = feature
@@ -337,15 +349,26 @@ module OpenTox
else
feature_uri = feature
end
- @prediction_dataset.add neighbor[:compound], feature_uri, true
+ if @feature_calculation_algorithm == "Substructure.lookup"
+ @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+ else
+ @prediction_dataset.add neighbor[:compound], feature_uri, true
+ end
+
unless features.has_key? feature
features[feature] = feature_uri
- @prediction_dataset.add_feature(feature_uri, {
- RDF.type => [OT.Substructure],
- OT.smarts => feature,
- OT.pValue => @p_values[feature],
- OT.effect => @effects[feature]
- })
+ if @feature_calculation_algorithm == "Substructure.lookup"
+ @prediction_dataset.add_feature(feature_uri, {
+ RDF.type => [OT.NumericFeature]
+ })
+ else
+ @prediction_dataset.add_feature(feature_uri, {
+ RDF.type => [OT.Substructure],
+ OT.smarts => feature,
+ OT.pValue => @p_values[feature],
+ OT.effect => @effects[feature]
+ })
+ end
f+=1
end
end
diff --git a/lib/parser.rb b/lib/parser.rb
index 56e4fed..07b44db 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -349,8 +349,11 @@ module OpenTox
# Load CSV string (format specification: http://toxcreate.org/help)
# @param [String] csv CSV representation of the dataset
+ # @param [Boolean] drop_missing Whether completely missing rows should be dropped
+ # @param [Boolean] all_numeric Whether all features should be treated as numeric
+ # @param [Boolean] del_nominal All nominal features will be removed
# @return [OpenTox::Dataset] Dataset object with CSV data
- def load_csv(csv, drop_missing=false)
+ def load_csv(csv, drop_missing=false, all_numeric=false)
row = 0
input = csv.split("\n")
headers = split_row(input.shift)
@@ -362,7 +365,7 @@ module OpenTox
row = split_row(row)
value_maps = detect_new_values(row, value_maps)
value_maps.each_with_index { |vm,j|
- if vm.size > @max_class_values # max @max_class_values classes.
+ if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
regression_features[j]=true
else
regression_features[j]=false
@@ -392,22 +395,30 @@ module OpenTox
def warnings
- info = ''
+ info = '<br>'
@feature_types.each do |feature,types|
+ @dataset.add_feature_metadata(feature,{RDF.type => []})
if types.uniq.size == 0
- type = "helper#MissingFeature"
- elsif types.uniq.size > 1
- type = OT.NumericFeature
+ @dataset.add_feature_metadata(
+ feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
+ )
+ info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
else
- type = types.first
+ info += "'#{@dataset.feature_name(feature)}' detected as "
+ types_arr = []
+ types.uniq.each { |t|
+ types_arr << t
+ info += "'#{t.split('#').last}', "
+ }
+
+ @dataset.add_feature_metadata(
+ feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
+ )
+
+ info.chop!.chop!
+ info += "<br>"
end
- @dataset.add_feature_metadata(feature,{RDF.type => [type]})
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
- # TODO: rewrite feature values
- # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
end
-
@dataset.metadata[OT.Info] = info
warnings = ''
@@ -469,28 +480,31 @@ module OpenTox
unless @duplicate_feature_indices.include? i
value = row[i]
- #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
feature = @features[feature_idx]
type = feature_type(value) # May be NIL
- type = OT.NominalFeature unless (type.nil? || regression_features[i])
@feature_types[feature] << type if type
+ # Add nominal type if #distinct values le @max_class_values
+ if type == OT.NumericFeature
+ @feature_types[feature] << OT.NominalFeature unless regression_features[i]
+ end
val = nil
case type
when OT.NumericFeature
val = value.to_f
+ val = nil if val.infinite?
when OT.NominalFeature
val = value.to_s
end
feature_idx += 1
- if val != nil
+ if val != nil
@dataset.add(compound.uri, feature, val)
- if type != OT.NumericFeature
+ if @feature_types[feature].include? OT.NominalFeature
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
- @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+ @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
end
end
@@ -654,7 +668,7 @@ module OpenTox
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
table.data[compound.uri] = row
end
-
+
# find and remove ignored_features
@activity_errors = table.clean_features
table.add_to_dataset @dataset
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 30cb2ba..4c26329 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -459,31 +459,64 @@ module OpenTox
def initialize(dataset)
@rows = []
@rows << ["SMILES"]
+
features = dataset.features.keys
- @rows.first << features
+
+ # prepare for subgraphs
+ have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+ if have_substructures.size == 1 && have_substructures[0]
+ features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+ end
+
+ # gather missing features
+ delete_features = []
+ features.each{ |id|
+ dataset.features[id][RDF.type].each { |typestr|
+ if typestr.include? "MissingFeature"
+ delete_features << id
+ end
+ }
+ }
+ features = features - delete_features
+
+ # detect nr duplicates per compound
+ compound_sizes = {}
+ dataset.compounds.each do |compound|
+ entries=dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for CSV export" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end
+
+ # substructures: complete data entries with zeroes of appropriate duplicates
+ features_smarts && dataset.complete_data_entries(compound_sizes)
+
+ # get headers
+ features_smarts && @rows.first << features_smarts || @rows.first << features
@rows.first.flatten!
- dataset.data_entries.each do |compound,entries|
+
+ dataset.compounds.each do |compound|
+ entries=dataset.data_entries[compound]
cmpd = Compound.new(compound)
- smiles = cmpd.to_smiles
inchi = URI.encode_www_form_component(cmpd.to_inchi)
- row_container = Array.new
- row = Array.new(@rows.first.size)
- row_container << row
- #row[0] = smiles
- row[0] = inchi
- entries.each do |feature, values|
- i = features.index(feature)+1
- values.each do |value|
- if row_container[0][i]
- #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
- row_container << row_container.last.collect
- row_container.last[i] = value
- #LOGGER.debug "RC: #{row_container.to_yaml}"
- else
- row_container.each { |r| r[i] = value }
- end
- end
+
+ # allocate container
+ row_container = Array.new(compound_sizes[compound])
+ (0...row_container.size).each do |i|
+ row_container[i] = Array.new(@rows.first.size)
end
+
+ entries.each { |feature, values|
+ (0...compound_sizes[compound]).each { |i|
+ j = features.index(feature)+1
+ row_container[i][0] = inchi
+ row_container[i][j] = values[i]
+ }
+ }
row_container.each { |r| @rows << r }
end
end
diff --git a/lib/transform.rb b/lib/transform.rb
index f6f769d..8632f6c 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -396,8 +396,8 @@ module OpenTox
@q_prop = gsl_q_prop_orig.row(0).to_a
end
- LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
- LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" if (@sims && @acts)
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
+ LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
@sims = [ gram_matrix, @sims ]
@@ -490,8 +490,10 @@ module OpenTox
@cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
- @model.fingerprints.each { |fp|
- cmpd = fp[0]; fp = fp[1]
+ # Major BUG! Must loop over @model.compounds, hash is unordered!
+ # @model.fingerprints.each
+ @model.compounds.each { |cmpd|
+ fp = @model.fingerprints[cmpd]
if @model.activities[cmpd] # row good
acts = @model.activities[cmpd]; @acts += acts
LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
diff --git a/lib/utils.rb b/lib/utils.rb
index d9d7b4b..88b8347 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,155 +1,414 @@
require 'csv'
+require 'tempfile'
module OpenTox
module Algorithm
+ @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+ @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+ @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+ @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
include OpenTox
# Calculate physico-chemical descriptors with OpenBabel, JOELib and CDK (via Ambit) and post the merged table to the dataset service.
- # @param[Hash] Required keys: :dataset_uri, :pc_type
+ # @param[Hash] required: :dataset_uri, :pc_type (comma-separated); :rjb when joelib runs; optional: :task (progress reporting), :add_uri, :descriptor, :lib (comma-separated subset of openbabel,joelib,cdk; all three if omitted)
# @return[String] result of ds.save on the new dataset (presumably its uri -- TODO confirm against Dataset#save); raises OpenTox::BadRequestError if no descriptor table was produced
-
def self.pc_descriptors(params)
+ ds = OpenTox::Dataset.find(params[:dataset_uri])
+ compounds = ds.compounds.collect
+ task_weights = {"joelib"=> 20, "openbabel"=> 1, "cdk"=> 50 }
+ task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step)))}
+ task_weights["load"] = 10
+ task_sum = Float task_weights.values.sum
+ task_weights.keys.each { |step| task_weights[step] /= task_sum }
+ task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+ jl_master=nil
+ cdk_master=nil
+ ob_master=nil
+
+
+ # # # openbabel (via ruby bindings); skipped only when :lib is given and does not list it
+ if !params[:lib] || params[:lib].split(",").include?("openbabel")
+ ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+ end
+
+
+ # # # joelib (via rjb); skipped only when :lib is given and does not list it
+ if !params[:lib] || params[:lib].split(",").include?("joelib")
+ jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+ end
+
+
+ # # # cdk (via REST): get_cdk_descriptors starts the Ambit calculations, load_ds_csv downloads and merges the results
+ if !params[:lib] || params[:lib].split(",").include?("cdk")
+ ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+ #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+ cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+ end
+
+ # # # fuse CSVs ("master" structures): rows are matched on their first cell; rows that exist only in the appended table are not added, unmatched rows keep nil in the new columns
+ if jl_master && cdk_master
+ nr_cols = (jl_master[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+ jl_master.each do |row|
+ temp = cdk_master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ master = cdk_master
+ else # either jl_master or cdk_master nil
+ master = jl_master || cdk_master
+ end
+
+ if ob_master && master
+ nr_cols = (ob_master[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ ob_master.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ else # either ob_master or master nil
+ master = ob_master || master
+ end
+
+ if master
+
+ ds = OpenTox::Dataset.find(
+ OpenTox::RestClientWrapper.post(
+ File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv"}
+ )
+ )
+
+ # # # add feature metadata (DC.description, DC.creator, OT.hasSource) using the entries of the pc_descriptors.yaml key file
+ pc_descriptors = YAML::load_file(@keysfile)
+ ambit_ids && ambit_ids.each_with_index { |id,idx|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+ ob_ids && ob_ids.each { |id|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+ creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+ creator_uri += "/#{id}" if params[:add_uri]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+ jl_ids && jl_ids.each { |id|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+ creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+ creator_uri += "/#{id}" if params[:add_uri]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+
+ ds.save
+ else
+ raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+ end
+
+ end
+
+
+ # Calculate OpenBabel physico-chemical descriptors for the key-file entries whose :lib is "openbabel".
+ # @param[Hash] required: :compounds (compound URIs), :pc_type (comma-separated), optional: :descriptor (restrict to one descriptor id)
+ # @return[Array] [ master, ids ]: parsed CSV rows (nil if no descriptor matched or an error was rescued and logged), array of selected descriptor ids
+ def self.get_ob_descriptors(params)
+
+ master = nil
+
begin
- ds = OpenTox::Dataset.find(params[:dataset_uri])
- compounds = ds.compounds.collect
- ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
- #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
- LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
- load_ds_csv(ambit_result_uri, smiles_to_inchi)
+ csvfile = Tempfile.open(['ob_descriptors-','.csv'])
+
+ pc_descriptors = YAML::load_file(@keysfile)
+ ids = pc_descriptors.collect{ |id, info|
+ id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+ if ids.length > 0
+ csvfile.puts((["SMILES"] + ids).join(","))
+
+ # remember inchis in URI-escaped form (presumably so that commas inside an InChI cannot split a CSV row -- confirm)
+ inchis = params[:compounds].collect { |c_uri|
+ URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi)
+ }
+
+ # Process compounds: one CSV row per compound, first cell is the escaped InChI (the header still calls that column "SMILES")
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_and_out_formats 'inchi', 'can'
+
+ inchis.each_with_index { |inchi, c_idx|
+ row = [inchis[c_idx]]
+ obconversion.read_string(obmol, URI.decode_www_form_component(inchi))
+ ids.each { |name|
+ if obmol.respond_to?(name.underscore)
+ val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore)
+ else
+ if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP"
+ val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol)
+ elsif name == "nF"
+ val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol)
+ elsif name == "spinMult" || name == "nHal" || name == "logP"
+ val = OpenBabel::OBDescriptor.find_type(name).predict(obmol)
+ end
+ end
+ if OpenTox::Algorithm.numeric?(val)
+ val = Float(val)
+ val = nil if val.nan?
+ val = nil if (val && val.infinite?)
+ end
+ row << val
+ }
+ LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+ csvfile.puts(row.join(","))
+ csvfile.flush
+ }
+ master = CSV::parse(File.open(csvfile.path, "rb").read)
+ end
+
rescue Exception => e
LOGGER.debug "#{e.class}: #{e.message}"
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ csvfile.close!
end
+ [ master, ids ]
+
end
-
- # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
- # @param[Hash] Required keys: :compounds, :pc_type
- # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
- def self.get_pc_descriptors(params)
+
+
+ # Calculate Joelib2 physico-chemical descriptors for the key-file entries whose :lib is "joelib".
+ # @param[Hash] required: :compounds (compound URIs), :pc_type (comma-separated), :rjb (raises "No Java environment" if missing), optional: :descriptor (restrict to one descriptor id)
+ # @return[Array] [ master, ids ]: parsed CSV rows (nil if no descriptor matched or an error was rescued and logged), array of selected descriptor ids
+ def self.get_jl_descriptors(params)
+
+ master = nil
+ s = params[:rjb]; raise "No Java environment" unless s
+
+ # Load keys, enter CSV headers
begin
+ csvfile = Tempfile.open(['jl_descriptors-','.csv'])
- ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
- ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
- descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
- descs_uris = []
- params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
- types = params[:pc_type].split(",")
- descs.each { |uri, cat_name|
- if types.include? cat_name[:category]
- descs_uris << uri
- end
- }
- if descs_uris.size == 0
- raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
+ pc_descriptors = YAML::load_file(@keysfile)
+ ids = pc_descriptors.collect{ |id, info|
+ id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+
+ if ids.length > 0
+ csvfile.puts((["SMILES"] + ids).join(","))
+
+ # remember inchis in URI-escaped form; they become the first cell of each CSV row
+ inchis = params[:compounds].collect { |c_uri|
+ cmpd = OpenTox::Compound.new(c_uri)
+ URI.encode_www_form_component(cmpd.to_inchi)
+ }
+
+ # Process compounds: write each one to a temporary SDF file, run joelib on it, then read the wanted values from joelib's output file
+ params[:compounds].each_with_index { |c_uri, c_idx|
+ cmpd = OpenTox::Compound.new(c_uri)
+ inchi = cmpd.to_inchi
+ sdf_data = cmpd.to_sdf
+
+ infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+ outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+ begin
+ infile.puts sdf_data
+ infile.flush
+ s.new(infile.path, outfile_path) # runs joelib
+
+ row = [inchis[c_idx]]
+ ids.each_with_index do |k,i| # Fill row: find the line matching key k, the line after it is the value (NOTE(review): @prev is never reset in this method -- confirm that stale state cannot leak between keys)
+ re = Regexp.new(k)
+ open(outfile_path) do |f|
+ f.each do |line|
+ if @prev == k
+ entry = line.chomp
+ val = nil
+ if OpenTox::Algorithm.numeric?(entry)
+ val = Float(entry)
+ val = nil if val.nan?
+ val = nil if (val && val.infinite?)
+ end
+ row << val
+ break
+ end
+ @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+ end
+ end
+ end
+ LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+ csvfile.puts(row.join(","))
+ csvfile.flush
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure # NOTE(review): File.delete raises if either file is missing, which would skip infile.close! -- confirm joelib always writes both
+ File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+ File.delete(outfile_path)
+ infile.close!
+ end
+ }
+ master = CSV::parse(File.open(csvfile.path, "rb").read)
end
- #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ [ csvfile].each { |f| f.close! }
+ end
+
+ [ master, ids ]
+
+ end
+
+ # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+ # @param[Hash] required: :compounds (compound URIs), :pc_type (comma-separated), :step (progress share of this stage); optional: :task (progress reporting), :descriptor (restrict to one descriptor id)
+ # @return[Array] [ ambit_result_uri, smiles_to_inchi, ids ]: Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features), hash SMILES => URI-escaped InChI, array of descriptor ids in request order
+ def self.get_cdk_descriptors(params)
+
+ ambit_result_uri = [] # 1st pos: base uri, then features
+ smiles_to_inchi = {}
+ task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 }
+ task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))}
+ task_sum = Float task_weights.values.sum
+ task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+ task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+ # extract wanted descriptors from config file and parameters; the ids are sorted by "pc_type:::id" so that descriptors of one pc_type are requested consecutively
+ pc_descriptors = YAML::load_file(@keysfile)
+
+ ids = pc_descriptors.collect { |id, info|
+ "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+ if ids.size > 0
+ ids.sort!
+ ids.collect! { |id| id.split(":::").last }
+
+ # create dataset at Ambit by uploading a one-column CSV of SMILES; errors are only logged
begin
- # Create SMI
- smiles_array = []; smiles_to_inchi = {}
params[:compounds].each do |n|
cmpd = OpenTox::Compound.new(n)
smiles_string = cmpd.to_smiles
smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
- smiles_array << smiles_string
end
- smi_file = Tempfile.open(['pc_ambit', '.csv'])
- pc_descriptors = nil
-
- # Create Ambit dataset
- smi_file.puts( "SMILES\n" )
- smi_file.puts( smiles_array.join("\n") )
- smi_file.flush
- ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush
+ ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features
rescue Exception => e
LOGGER.debug "#{e.class}: #{e.message}"
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
ensure
smi_file.close! if smi_file
end
- ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
-
- # Calculate 3D for CPSA
- if types.include? "cpsa"
- ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
- LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
- end
-
- # Get Ambit results
- ambit_result_uri = [] # 1st pos: base uri, then features
- ambit_result_uri << ambit_ds_uri + "?"
+ # get SMILES feature URI (NOTE(review): ambit_ds_uri is nil here if the upload above failed and was rescued -- the next call would then raise)
+ ambit_smiles_uri = OpenTox::RestClientWrapper.get(
+ ambit_ds_uri + "/features",
+ {:accept=> "text/uri-list"}
+ ).chomp
ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
- descs_uris.each_with_index do |uri, i|
- algorithm = Algorithm::Generic.new(uri)
+ # always calculate 3D (http://goo.gl/Tk81j) by posting the dataset to the MOPAC model (the returned URI is not used), then get results
+ OpenTox::RestClientWrapper.post(
+ @ambit_mopac_model_uri,
+ {:dataset_uri => ambit_ds_uri},
+ {:accept => "text/uri-list"}
+ )
+ current_cat = ""
+ ids.each_with_index do |id, i|
+ old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type]
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != ""
+ algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id)
result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
ambit_result_uri << result_uri.split("?")[1] + "&"
- LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+ LOGGER.debug "Ambit (#{ids.size}): #{i+1}"
end
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task]
#LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
- [ ambit_result_uri, smiles_to_inchi ]
-
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end
+
+ [ ambit_result_uri, smiles_to_inchi, ids ]
+
end
# Load the Ambit results via CSV and merge them column-wise into one table
# @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
- # @return[String] dataset uri
- def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+ # @param[Hash] keys: SMILES, values: InChIs
+ # @param[Array] single_ids: descriptor ids, one for each feature piece of ambit_result_uri (piece idx maps to single_ids[idx-2])
+ # @return[Array] [ master, ids, ambit_ids ]: merged CSV rows, descriptor id repeated for every column it contributed, header cells of the merged columns
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
master=nil
- (1...ambit_result_uri.size).collect { |idx|
- curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
- LOGGER.debug "Requesting #{curr_uri}"
- csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
- if csv_data[0] && csv_data[0].size>1
- if master.nil? # This is the smiles entry
- (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
- master = csv_data
- next
- else
- index_uri = csv_data[0].index("SMILES")
- csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
-
- nr_cols = (csv_data[0].size)-1
- LOGGER.debug "Merging #{nr_cols} new columns"
- master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
- csv_data.each do |row|
- temp = master.assoc(row[0]) # Finds the appropriate line in master
- ((-1*nr_cols)..-1).collect.each { |idx|
- temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
- }
+ ids=[]
+ ambit_ids=[]
+
+ if ambit_result_uri.size > 0
+ (1...ambit_result_uri.size).collect { |idx|
+ curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+ #LOGGER.debug "Requesting #{curr_uri}"
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+ if csv_data[0] && csv_data[0].size>1
+ if master.nil? # This is the smiles entry: its SMILES cells are replaced by the matching InChIs and it becomes the master table
+ (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+ master = csv_data
+ next
+ else
+ index_uri = csv_data[0].index("SMILES")
+ csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+
+ nr_cols = (csv_data[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ ids += Array.new(nr_cols, single_ids[idx-2])
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ csv_data.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
end
end
- end
- }
+ }
- index_uri = master[0].index("Compound")
- master.map {|i| i.delete_at(index_uri)}
- master[0].each {|cell| cell.chomp!(" ")}
- master[0][0] = "Compound" #"SMILES"
- index_smi = master[0].index("SMILES")
- master.map {|i| i.delete_at(index_smi)} if index_smi
- #master[0][0] = "SMILES"
+ index_uri = master[0].index("Compound")
+ master.map {|i| i.delete_at(index_uri)}
+ master[0].each {|cell| cell.chomp!(" ")}
+ master[0][0] = "Compound" # temporary header; it is overwritten with "SMILES" once any further SMILES column has been removed
+ index_smi = master[0].index("SMILES")
+ master.map {|i| i.delete_at(index_smi)} if index_smi
+ master[0][0] = "SMILES"
+ ambit_ids=master[0].collect
+ ambit_ids.shift
+ end
#LOGGER.debug "-------- AM: Writing to dumpfile"
#File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
- parser = OpenTox::Parser::Spreadsheets.new
- ds = OpenTox::Dataset.new(nil,subjectid)
- ds.save(subjectid)
- parser.dataset = ds
- ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
- ds.save(subjectid)
+ [ master, ids, ambit_ids ]
+
end