summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorMartin Gütlein <martin.guetlein@gmail.com>2010-04-14 11:30:58 +0200
committerMartin Gütlein <martin.guetlein@gmail.com>2010-04-14 11:30:58 +0200
commitef9d136c275f86147fea116c9351190489ff41c7 (patch)
treed40d1e269ce19023de7eadb5c72c974377e0d056 /lib
parent483b89ab23449372582e8754b3b9b481d338654f (diff)
performance tweaking for owl dataset loading
Diffstat (limited to 'lib')
-rw-r--r--lib/dataset.rb77
-rw-r--r--lib/owl.rb225
2 files changed, 232 insertions, 70 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index ee92a56..d6e0b39 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -24,40 +24,46 @@ module OpenTox
d.source = owl.source
d.identifier = owl.identifier.sub(/^\[/,'').sub(/\]$/,'')
d.uri = d.identifier
- d.data = owl.data
- halt 404, "Dataset #{uri} empty!" if d.data.empty?
- d.data.each do |compound,features|
- d.compounds << compound
- features.each do |f,v|
- d.features << f.keys[0]
- end
- end
- d.compounds.uniq!
- d.features.uniq!
- #PENDING: remove debug checks
- d.data.each do |c,f|
- f.each do |ff,v|
- raise "illegal data: feature is no string "+ff.inspect unless ff.is_a?(Hash)
- end
- end
- raise "illedal dataset data\n"+d.data.inspect+"\n" unless d.data.is_a?(Hash) and d.data.values.is_a?(Array)
- raise "illegal dataset features:\n"+d.features.inspect+"\n" unless d.features.size>0 and d.features[0].is_a?(String)
+ # when loading a dataset from owl, only compound- and feature-uris are loaded
+ owl.load_data_compounds_and_features(d.compounds, d.features)
+ # all features are marked as dirty, loaded dynamically later
+ d.init_dirty_features(owl)
+
+ d.compounds.uniq!
+ d.features.uniq!
end
return d
end
# creates a new dataset, using only those compounsd specified in new_compounds
# returns uri of new dataset
- def create_new_dataset( new_compounds, new_title, new_source )
+ def create_new_dataset( new_compounds, new_features, new_title, new_source )
+ # new_compounds: compounds that are copied into the new dataset
+ # new_features: features whose values are copied for each of those compounds
+ # new_title, new_source: assigned to title and source of the new dataset
+ # returns the result of saving the new dataset
+
+ # load the required features, i.e. the requested features that are still dirty
+ # (intersection: dirty features that are not requested are not needed and stay dirty)
+ if defined? @dirty_features
+ (@dirty_features & new_features).each{|f| load_feature_values(f)}
+ end
dataset = OpenTox::Dataset.new
dataset.title = new_title
dataset.source = new_source
- dataset.features = @features
+ dataset.features = new_features
dataset.compounds = new_compounds
+
+ # copy dataset data for compounds and features
+ # PENDING: why storing feature values in an array?
new_compounds.each do |c|
- dataset.data[c] = @data[c]
+ data_c = []
+ @data[c].each do |d|
+ # restrict each data entry to the requested features
+ m = {}
+ new_features.each do |f|
+ m[f] = d[f]
+ end
+ data_c << m
+ end
+
+ dataset.data[c] = data_c
end
return dataset.save
end
@@ -94,6 +100,10 @@ module OpenTox
# return compound-feature value
def get_value(compound, feature)
+ if (defined? @dirty_features) && @dirty_features.include?(feature)
+ load_feature_values(feature)
+ end
+
v = @data[compound]
raise "no values for compound "+compound.to_s if v==nil
if v.is_a?(Array)
@@ -113,8 +123,25 @@ module OpenTox
end
end
+ # Loads feature values from the owl that was passed to init_dirty_features.
+ # feature: a feature that is still marked dirty; its values are loaded and its dirty-flag is removed.
+ #          If nil (default), @data is reset, the values of all features are loaded and
+ #          all dirty-flags are removed.
+ # raises "feature already loaded" if the given feature is not marked dirty
+ def load_feature_values(feature=nil)
+   if feature
+     raise "feature already loaded" unless @dirty_features.include?(feature)
+     # same owl method as in the load-all branch, restricted to one feature via the third argument
+     @owl.load_dataset_feature_values(@compounds, @data, feature)
+     @dirty_features.delete(feature)
+   else
+     # start from scratch; presumably this avoids duplicating values of features
+     # that were already loaded one by one -- TODO confirm
+     @data = {}
+     @owl.load_dataset_feature_values(@compounds, @data)
+     @dirty_features.clear
+   end
+ end
def save
+ # loads all features before saving
+ if ((defined? @dirty_features) && @dirty_features.size > 0)
+ load_feature_values()
+ end
+
@features.uniq!
@compounds.uniq!
RestClient::Resource.new(@@config[:services]["opentox-dataset"], :user => @@users[:users].keys[0], :password => @@users[:users].values[0]).post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s
@@ -232,6 +259,12 @@ module OpenTox
end
=end
- end
+
+ # Marks all features of this dataset as dirty (values not loaded yet) and
+ # stores the owl object the feature values are loaded from on demand.
+ # owl: object that provides load_dataset_feature_values
+ def init_dirty_features(owl)
+   # independent copy instead of an alias: load_feature_values deletes from / clears
+   # @dirty_features, which must not remove the features from @features
+   @dirty_features = @features.uniq
+   @owl = owl
+ end
+ end
+
end
diff --git a/lib/owl.rb b/lib/owl.rb
index f09ee01..374c1e6 100644
--- a/lib/owl.rb
+++ b/lib/owl.rb
@@ -1,3 +1,14 @@
+class Redland::Literal
+
+  # Returns the datatype uri of this literal as a string, nil if librdf yields no datatype uri.
+  # The literal node of the ruby swig api provides the 'value' of a literal but not the 'datatype',
+  # so the librdf functions are called directly (solution found in the mailing list).
+  def datatype
+    datatype_uri = Redland.librdf_node_get_literal_value_datatype_uri(self.node)
+    datatype_uri ? Redland.librdf_uri_to_string(datatype_uri) : nil
+  end
+
+end
+
module OpenTox
class Owl
@@ -18,6 +29,7 @@ module OpenTox
end
def self.from_data(data,uri)
+
owl = OpenTox::Owl.new
parser = Redland::Parser.new
begin
@@ -37,10 +49,6 @@ module OpenTox
@model.to_string
end
- #def predictedVariables
- #
- #end
-
def method_missing(name, *args)
methods = ['title', 'source', 'identifier', 'algorithm', 'independentVariables', 'dependentVariables', 'predictedVariables', 'date','trainingDataset', 'hasStatus', "percentageCompleted" ]
if methods.include? name.to_s.sub(/=/,'')
@@ -142,25 +150,143 @@ module OpenTox
@model.add feature, DC['source'], feature_uri
end
feature
- end
+ end
+
+ # Appends the compound uris and feature identifiers found in the model to the given arrays.
+ # Feature values are not loaded here for performance reasons.
+ # compounds: array, receives the identifier of the compound of each DataEntry
+ # features: array, receives the identifier value of each Feature
+ # raises "feature is no literal" if a feature identifier is not a Redland::Literal
+ def load_dataset( compounds, features )
+   compound_predicate = OT['compound']
+   identifier_predicate = DC['identifier']
+
+   data_entries = @model.subjects(RDF['type'], OT['DataEntry'])
+   data_entries.each do |entry|
+     node = @model.object(entry, compound_predicate)
+     compounds << @model.object(node, identifier_predicate).to_s
+   end
+
+   feature_nodes = @model.subjects(RDF['type'], OT['Feature'])
+   feature_nodes.each do |feature_node|
+     identifier = @model.object(feature_node, identifier_predicate)
+     raise "feature is no literal" unless identifier.is_a?(Redland::Literal)
+     # PENDING: to be able to recreate literal nodes for features, the datatype is stored
+     @@feature_datatype = identifier.datatype
+     features << identifier.value
+   end
+
+   LOGGER.debug "loaded #{compounds.size} compounds and #{features.size} features"
+ end
+
+ # Loads feature values from the rdf model into the data-hash.
+ # compounds: compound uris, data gets an array entry for each of them (existing entries are kept)
+ # data: hash, compound uri => array of {feature uri => value} hashes; loaded values are appended
+ # feature_uri: only values of this feature are loaded; if nil, all feature values are loaded
+ # raises if the feature node is not found, if a node that should be a literal is none,
+ # or if a value type / value datatype is not supported (only double and string values are)
+ #
+ # general remark on the rdf loading (found out with some testing):
+ # the search methods (subjects/find) are fast, the time consuming part is creating resources,
+ # which cannot be avoided in general (implemented some performance tweaks with uri storing when loading all features)
+ def load_dataset_feature_values( compounds, data, feature_uri=nil )
+
+   load_all_features = feature_uri.nil?
+   LOGGER.debug("load feature values"+ ( load_all_features ? "" : (" for feature: "+feature_uri) ) )
+
+   # values are stored in the data-hash, hash has a key for each compound
+   compounds.each{|c| data[c] = [] unless data[c]}
+
+   ot_values = OT['values']
+   ot_feature = OT['feature']
+   ot_compound = OT['compound']
+   dc_identifier = DC['identifier']
+   ot_value = OT['value']
+   rdf_type = RDF['type']
+   ot_feature_value = OT['FeatureValue']
+
+   feature_node = nil
+
+   # create feature node for feature uri if specified
+   unless load_all_features
+     feature_literal = Redland::Literal.new(feature_uri,nil,Redland::Uri.new(@@feature_datatype))
+     feature_node = @model.subject(dc_identifier, feature_literal)
+     # remark: solution without creating the literal node:
+     #@model.subjects(RDF['type'], OT['Feature']).each do |feature|
+     #  f_uri = @model.object(feature, dc_identifier).value
+     #  if feature_uri==f_uri
+     #    feature_node = feature
+     #    break
+     #  end
+     #end
+     raise "feature node not found" unless feature_node
+   end
+
+   count = 0
+
+   # performance tweak: store uris to save some resource init time
+   compound_uri_store = {}
+   feature_uri_store = {}
+
+   # search for all feature_value_node with property 'ot_feature'
+   # feature_node is either nil, i.e. a wildcard or specified
+   @model.find(nil, ot_feature, feature_node) do |feature_value_node,_p,o|
+
+     # get compound_uri by "backtracking" to values node (property is 'ot_values'), then get compound_node via 'ot_compound'
+     value_nodes = @model.subjects(ot_values,feature_value_node)
+     raise "more than one value node "+value_nodes.size.to_s unless value_nodes.size==1
+     value_node = value_nodes[0]
+     compound_node = @model.object(value_node, ot_compound)
+     compound_uri = compound_uri_store[compound_node.to_s]
+     unless compound_uri
+       compound_uri = @model.object(compound_node, dc_identifier).to_s
+       compound_uri_store[compound_node.to_s] = compound_uri
+     end
+
+     if load_all_features
+       # if load all features, feature_uri is not specified, derive it from the feature node
+       feature_uri = feature_uri_store[o.to_s]
+       unless feature_uri
+         feature_literal = @model.object(o, dc_identifier)
+         raise "feature is no literal" unless feature_literal.is_a?(Redland::Literal)
+         feature_uri = feature_literal.value
+         feature_uri_store[o.to_s] = feature_uri
+       end
+     end
+
+     value_node_type = @model.object(feature_value_node, rdf_type)
+     if (value_node_type == ot_feature_value)
+       value_literal = @model.object( feature_value_node, ot_value)
+       raise "feature value no literal" unless value_literal.is_a?(Redland::Literal)
+
+       value_datatype = value_literal.datatype
+       case value_datatype
+       when /XMLSchema#double/
+         data[compound_uri] << {feature_uri => value_literal.value.to_f }
+       when /XMLSchema#string/
+         data[compound_uri] << {feature_uri => value_literal.value }
+       else
+         # to_s: the datatype is nil for literals without datatype
+         raise "feature value datatype undefined: "+value_datatype.to_s
+       end
+     else
+       raise "feature value type not yet implemented "+value_node_type.to_s
+     end
+     # every matching feature value is loaded, the loop is not capped
+     count += 1
+     LOGGER.debug "loaded "+count.to_s+" feature values" if (count%500 == 0)
+   end
+
+   LOGGER.debug "loaded "+count.to_s+" feature values"
+ end
- def data
- data = {}
- @model.subjects(RDF['type'], OT['DataEntry']).each do |data_entry|
- compound_node = @model.object(data_entry, OT['compound'])
- compound_uri = @model.object(compound_node, DC['identifier']).to_s
- @model.find(data_entry, OT['values'], nil) do |s,p,values|
- feature_node = @model.object values, OT['feature']
- feature_uri = @model.object(feature_node, DC['identifier']).to_s.sub(/\^\^.*$/,'') # remove XML datatype
- type = @model.object(values, RDF['type'])
- if type == OT['FeatureValue']
- value = @model.object(values, OT['value']).to_s
- case value.to_s
- when TRUE_REGEXP # defined in environment.rb
- value = true
- when FALSE_REGEXP # defined in environment.rb
- value = false
- when /.*\^\^<.*XMLSchema#.*>/
+=begin
+ def data
+ LOGGER.debug("getting data from model")
+
+ data = {}
+ @model.subjects(RDF['type'], OT['DataEntry']).each do |data_entry|
+ compound_node = @model.object(data_entry, OT['compound'])
+ compound_uri = @model.object(compound_node, DC['identifier']).to_s
+ @model.find(data_entry, OT['values'], nil) do |s,p,values|
+ feature_node = @model.object values, OT['feature']
+ feature_uri = @model.object(feature_node, DC['identifier']).to_s.sub(/\^\^.*$/,'') # remove XML datatype
+ type = @model.object(values, RDF['type'])
+ if type == OT['FeatureValue']
+ value = @model.object(values, OT['value']).to_s
+ case value.to_s
+ when TRUE_REGEXP # defined in environment.rb
+ value = true
+ when FALSE_REGEXP # defined in environment.rb
+ value = false
+ when /.*\^\^<.*XMLSchema#.*>/
#HACK for reading ambit datasets
case value.to_s
when /XMLSchema#string/
@@ -171,36 +297,39 @@ module OpenTox
LOGGER.warn " ILLEGAL TYPE "+compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri
value = nil
end
- else
- LOGGER.warn compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri
- value = nil
- end
+ else
+ LOGGER.warn compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri
+ value = nil
+ end
LOGGER.debug "converting owl to yaml, #compounds: "+(data.keys.size+1).to_s if (data.keys.size+1)%10==0 && !data.has_key?(compound_uri)
- #return data if (data.keys.size+1)%2==0 && !data.has_key?(compound_uri)
+
+ return data if (data.keys.size)>9 && !data.has_key?(compound_uri)
+
#puts "c "+compound_uri.to_s
#puts "f "+feature_uri.to_s
#puts "v "+value.to_s
#puts ""
- data[compound_uri] = [] unless data[compound_uri]
- data[compound_uri] << {feature_uri => value} unless value.nil?
- elsif type == OT['Tuple']
- entry = {}
- data[compound_uri] = [] unless data[compound_uri]
- #data[compound_uri][feature_uri] = [] unless data[compound_uri][feature_uri]
- @model.find(values, OT['complexValue'],nil) do |s,p,complex_value|
- name_node = @model.object complex_value, OT['feature']
- name = @model.object(name_node, DC['title']).to_s
- value = @model.object(complex_value, OT['value']).to_s
- v = value.sub(/\^\^.*$/,'') # remove XML datatype
- v = v.to_f if v.match(/^[\.|\d]+$/) # guess numeric datatype
- entry[name] = v
- end
- data[compound_uri] << {feature_uri => entry} unless entry.empty?
- end
- end
- end
- data
- end
+ data[compound_uri] = [] unless data[compound_uri]
+ data[compound_uri] << {feature_uri => value} unless value.nil?
+ elsif type == OT['Tuple']
+ entry = {}
+ data[compound_uri] = [] unless data[compound_uri]
+ #data[compound_uri][feature_uri] = [] unless data[compound_uri][feature_uri]
+ @model.find(values, OT['complexValue'],nil) do |s,p,complex_value|
+ name_node = @model.object complex_value, OT['feature']
+ name = @model.object(name_node, DC['title']).to_s
+ value = @model.object(complex_value, OT['value']).to_s
+ v = value.sub(/\^\^.*$/,'') # remove XML datatype
+ v = v.to_f if v.match(/^[\.|\d]+$/) # guess numeric datatype
+ entry[name] = v
+ end
+ data[compound_uri] << {feature_uri => entry} unless entry.empty?
+ end
+ end
+ end
+ data
+ end
+=end
- end
+ end
end