diff options
author | Martin Gütlein <martin.guetlein@gmail.com> | 2010-04-14 11:30:58 +0200 |
---|---|---|
committer | Martin Gütlein <martin.guetlein@gmail.com> | 2010-04-14 11:30:58 +0200 |
commit | ef9d136c275f86147fea116c9351190489ff41c7 (patch) | |
tree | d40d1e269ce19023de7eadb5c72c974377e0d056 /lib | |
parent | 483b89ab23449372582e8754b3b9b481d338654f (diff) |
performance tweaking for owl dataset loading
Diffstat (limited to 'lib')
-rw-r--r-- | lib/dataset.rb | 77 | ||||
-rw-r--r-- | lib/owl.rb | 225 |
2 files changed, 232 insertions, 70 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb index ee92a56..d6e0b39 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -24,40 +24,46 @@ module OpenTox d.source = owl.source d.identifier = owl.identifier.sub(/^\[/,'').sub(/\]$/,'') d.uri = d.identifier - d.data = owl.data - halt 404, "Dataset #{uri} empty!" if d.data.empty? - d.data.each do |compound,features| - d.compounds << compound - features.each do |f,v| - d.features << f.keys[0] - end - end - d.compounds.uniq! - d.features.uniq! - #PENDING: remove debug checks - d.data.each do |c,f| - f.each do |ff,v| - raise "illegal data: feature is no string "+ff.inspect unless ff.is_a?(Hash) - end - end - raise "illedal dataset data\n"+d.data.inspect+"\n" unless d.data.is_a?(Hash) and d.data.values.is_a?(Array) - raise "illegal dataset features:\n"+d.features.inspect+"\n" unless d.features.size>0 and d.features[0].is_a?(String) + # when loading a dataset from owl, only compound- and feature-uris are loaded + owl.load_data_compounds_and_features(d.compounds, d.features) + # all features are marked as dirty, loaded dynamically later + d.init_dirty_features(owl) + + d.compounds.uniq! + d.features.uniq! end return d end # creates a new dataset, using only those compounsd specified in new_compounds # returns uri of new dataset - def create_new_dataset( new_compounds, new_title, new_source ) + def create_new_dataset( new_compounds, new_features, new_title, new_source ) + + # load require features + if ((defined? @dirty_features) && (@dirty_features - new_features).size > 0) + (@dirty_features - new_features).each{|f| load_feature_values(f)} + end dataset = OpenTox::Dataset.new dataset.title = new_title dataset.source = new_source - dataset.features = @features + dataset.features = new_features dataset.compounds = new_compounds + + # Ccopy dataset data for compounds and features + # PENDING: why storing feature values in an array? new_compounds.each do |c| - dataset.data[c] = @data[c] + data_c = [] + @data[c].each do |d| + m = {} + new_features.each do |f| + m[f] = d[f] + end + data_c << m + end + + dataset.data[c] = data_c end return dataset.save end @@ -94,6 +100,10 @@ module OpenTox # return compound-feature value def get_value(compound, feature) + if (defined? @dirty_features) && @dirty_features.include?(feature) + load_feature_values(feature) + end + v = @data[compound] raise "no values for compound "+compound.to_s if v==nil if v.is_a?(Array) @@ -113,8 +123,25 @@ module OpenTox end end + # loads specified feature and removes dirty-flag, loads all features if feature is nil + def load_feature_values(feature=nil) + if feature + raise "feature already loaded" unless @dirty_features.include?(feature) + @owl.load_data_compounds_and_features(@compounds, @data, feature) + @dirty_features.delete(feature) + else + @data = {} + @owl.load_dataset_feature_values(@compounds, @data) + @dirty_features.clear + end + end def save + # loads all features before loading + if ((defined? @dirty_features) && @dirty_features.size > 0) + load_feature_values() + end + @features.uniq! @compounds.uniq! RestClient::Resource.new(@@config[:services]["opentox-dataset"], :user => @@users[:users].keys[0], :password => @@users[:users].values[0]).post(self.to_yaml, :content_type => "application/x-yaml").chomp.to_s @@ -232,6 +259,12 @@ module OpenTox end =end - end + + def init_dirty_features(owl) + @dirty_features = @features + @owl = owl + end + end + end @@ -1,3 +1,14 @@ +class Redland::Literal + + # the literal node of the ruby swig api provdides the 'value' of a literal but not the 'datatype' + # found solution in mailing list + def datatype() + uri = Redland.librdf_node_get_literal_value_datatype_uri(self.node) + return Redland.librdf_uri_to_string(uri) if uri + end + +end + module OpenTox class Owl @@ -18,6 +29,7 @@ module OpenTox end def self.from_data(data,uri) + owl = OpenTox::Owl.new parser = Redland::Parser.new begin @@ -37,10 +49,6 @@ module OpenTox @model.to_string end - #def predictedVariables - # - #end - def method_missing(name, *args) methods = ['title', 'source', 'identifier', 'algorithm', 'independentVariables', 'dependentVariables', 'predictedVariables', 'date','trainingDataset', 'hasStatus', "percentageCompleted" ] if methods.include? name.to_s.sub(/=/,'') @@ -142,25 +150,143 @@ module OpenTox @model.add feature, DC['source'], feature_uri end feature - end + end + + # feature values are not loaded for performance reasons + # loading compounds and features into arrays that are given as params + def load_dataset( compounds, features ) + ot_compound = OT['compound'] + dc_identifier = DC['identifier'] + @model.subjects(RDF['type'], OT['DataEntry']).each do |data_entry| + compound_node = @model.object(data_entry, ot_compound) + compound_uri = @model.object(compound_node, dc_identifier).to_s + compounds << compound_uri + end + @model.subjects(RDF['type'], OT['Feature']).each do |feature| + feature_literal = @model.object(feature, dc_identifier) + raise "feature is no literal" unless feature_literal.is_a?(Redland::Literal) + # PENDING: to be able to recreate literal nodes for features, the datatype is stored + @@feature_datatype = feature_literal.datatype + features << feature_literal.value + end + LOGGER.debug "loaded "+compounds.size.to_s+" compounds and "+features.size.to_s+" features" + end + + # loading feature values for the specified feature + # if feature is nil, all feature values are loaded + # + # general remark on the rdf loading (found out with some testing): + # the search methods (subjects/find) are fast, the time consuming parts is creating resources, + # which cannot be avoided in general (implemented some performance tweaks with uri storing when loading all features) + def load_dataset_feature_values( compounds, data, feature_uri=nil ) + + LOGGER.debug("load feature values"+ ( (feature_uri!=nil)?(" for feature: "+feature_uri):"") ) + + # values are stored in the data-hash, hash has a key for each compound + compounds.each{|c| data[c] = [] unless data[c]} + + ot_values = OT['values'] + ot_feature = OT['feature'] + ot_compound = OT['compound'] + dc_identifier = DC['identifier'] + ot_value = OT['value'] + rdf_type = RDF['type'] + ot_feature_value = OT['FeatureValue'] + + load_all_features = feature_uri==nil + feature_node = nil + + # create feature node for feature uri if specified + unless load_all_features + feature_literal = Redland::Literal.new(feature_uri,nil,Redland::Uri.new(@@feature_datatype)) + feature_node = @model.subject(dc_identifier, feature_literal) + # remark: solution without creating the literal node: + #@model.subjects(RDF['type'], OT['Feature']).each do |feature| + # f_uri = @model.object(feature, dc_identifier).value + # if feature_uri==f_uri + # feature_node = feature + # break + # end + #end + raise "feature node not found" unless feature_node + end + + count = 0 + + # preformance tweak: store uirs to save some resource init time + compound_uri_store = {} + feature_uri_store = {} + + # search for all feature_value_node with property 'ot_feature' + # feature_node is either nil, i.e. a wildcard or specified + @model.find(nil, ot_feature, feature_node) do |feature_value_node,p,o| + + # get compound_uri by "backtracking" to values node (property is 'ot_values'), then get compound_node via 'ot_compound' + value_nodes = @model.subjects(ot_values,feature_value_node) + raise "more than one value node "+value_nodes.size.to_s unless value_nodes.size==1 + value_node = value_nodes[0] + compound_node = @model.object(value_node, ot_compound) + compound_uri = compound_uri_store[compound_node.to_s] + unless compound_uri + compound_uri = @model.object(compound_node, dc_identifier).to_s + compound_uri_store[compound_node.to_s] = compound_uri + end + + if load_all_features + # if load all features, feautre_uri is not specified, derieve from feature_node + feature_uri = feature_uri_store[o.to_s] + unless feature_uri + feature_literal = @model.object(o, dc_identifier) + raise "feature is no literal" unless feature_literal.is_a?(Redland::Literal) + feature_uri = feature_literal.value + feature_uri_store[o.to_s] = feature_uri + end + end + + value_node_type = @model.object(feature_value_node, rdf_type) + if (value_node_type == ot_feature_value) + value_literal = @model.object( feature_value_node, ot_value) + raise "feature value no literal" unless value_literal.is_a?(Redland::Literal) + + case value_literal.datatype + when /XMLSchema#double/ + data[compound_uri] << {feature_uri => value_literal.value.to_f } + when /XMLSchema#string/ + data[compound_uri] << {feature_uri => value_literal.value } + else + raise "feature value datatype undefined: "+value_literal.datatype + end + else + raise "feature value type not yet implemented "+value_node_type.to_s + end + count += 1 + LOGGER.debug "loaded "+count.to_s+" feature values" if (count%500 == 0) + break if count == 1000 + end + + LOGGER.debug "loaded "+count.to_s+" feature values" + end - def data - data = {} - @model.subjects(RDF['type'], OT['DataEntry']).each do |data_entry| - compound_node = @model.object(data_entry, OT['compound']) - compound_uri = @model.object(compound_node, DC['identifier']).to_s - @model.find(data_entry, OT['values'], nil) do |s,p,values| - feature_node = @model.object values, OT['feature'] - feature_uri = @model.object(feature_node, DC['identifier']).to_s.sub(/\^\^.*$/,'') # remove XML datatype - type = @model.object(values, RDF['type']) - if type == OT['FeatureValue'] - value = @model.object(values, OT['value']).to_s - case value.to_s - when TRUE_REGEXP # defined in environment.rb - value = true - when FALSE_REGEXP # defined in environment.rb - value = false - when /.*\^\^<.*XMLSchema#.*>/ +=begin + def data + LOGGER.debug("getting data from model") + + data = {} + @model.subjects(RDF['type'], OT['DataEntry']).each do |data_entry| + compound_node = @model.object(data_entry, OT['compound']) + compound_uri = @model.object(compound_node, DC['identifier']).to_s + @model.find(data_entry, OT['values'], nil) do |s,p,values| + feature_node = @model.object values, OT['feature'] + feature_uri = @model.object(feature_node, DC['identifier']).to_s.sub(/\^\^.*$/,'') # remove XML datatype + type = @model.object(values, RDF['type']) + if type == OT['FeatureValue'] + value = @model.object(values, OT['value']).to_s + case value.to_s + when TRUE_REGEXP # defined in environment.rb + value = true + when FALSE_REGEXP # defined in environment.rb + value = false + when /.*\^\^<.*XMLSchema#.*>/ #HACK for reading ambit datasets case value.to_s when /XMLSchema#string/ @@ -171,36 +297,39 @@ module OpenTox LOGGER.warn " ILLEGAL TYPE "+compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri value = nil end - else - LOGGER.warn compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri - value = nil - end + else + LOGGER.warn compound_uri + " has value '" + value.to_s + "' for feature " + feature_uri + value = nil + end LOGGER.debug "converting owl to yaml, #compounds: "+(data.keys.size+1).to_s if (data.keys.size+1)%10==0 && !data.has_key?(compound_uri) - #return data if (data.keys.size+1)%2==0 && !data.has_key?(compound_uri) + + return data if (data.keys.size)>9 && !data.has_key?(compound_uri) + #puts "c "+compound_uri.to_s #puts "f "+feature_uri.to_s #puts "v "+value.to_s #puts "" - data[compound_uri] = [] unless data[compound_uri] - data[compound_uri] << {feature_uri => value} unless value.nil? - elsif type == OT['Tuple'] - entry = {} - data[compound_uri] = [] unless data[compound_uri] - #data[compound_uri][feature_uri] = [] unless data[compound_uri][feature_uri] - @model.find(values, OT['complexValue'],nil) do |s,p,complex_value| - name_node = @model.object complex_value, OT['feature'] - name = @model.object(name_node, DC['title']).to_s - value = @model.object(complex_value, OT['value']).to_s - v = value.sub(/\^\^.*$/,'') # remove XML datatype - v = v.to_f if v.match(/^[\.|\d]+$/) # guess numeric datatype - entry[name] = v - end - data[compound_uri] << {feature_uri => entry} unless entry.empty? - end - end - end - data - end + data[compound_uri] = [] unless data[compound_uri] + data[compound_uri] << {feature_uri => value} unless value.nil? + elsif type == OT['Tuple'] + entry = {} + data[compound_uri] = [] unless data[compound_uri] + #data[compound_uri][feature_uri] = [] unless data[compound_uri][feature_uri] + @model.find(values, OT['complexValue'],nil) do |s,p,complex_value| + name_node = @model.object complex_value, OT['feature'] + name = @model.object(name_node, DC['title']).to_s + value = @model.object(complex_value, OT['value']).to_s + v = value.sub(/\^\^.*$/,'') # remove XML datatype + v = v.to_f if v.match(/^[\.|\d]+$/) # guess numeric datatype + entry[name] = v + end + data[compound_uri] << {feature_uri => entry} unless entry.empty? + end + end + end + data + end +=end - end + end end |