From b93002b4ea50ff7e357da08abd10577347ce2d5f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 11 Nov 2010 09:31:27 +0100 Subject: first steps towards version 2.0, yard documentation started, passes compound, dataset, feature, algorithm, fminer tests --- lib/dataset.rb | 482 ++++++++++++++++++--------------------------------------- 1 file changed, 148 insertions(+), 334 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 7c8ce24..05b2ed3 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,74 +1,19 @@ module OpenTox # Ruby wrapper for OpenTox Dataset Webservices (http://opentox.org/dev/apis/api-1.2/dataset). - # - # Examples: - # require "opentox-ruby-api-wrapper" - # - # # Creating datasets - # - # # create an empty dataset - # dataset = OpenTox::Dataset.new - # # create an empty dataset with URI - # # this does not load data from the dataset service - use one of the load_* methods - # dataset = OpenTox::Dataset.new("http:://webservices.in-silico/ch/dataset/1") - # # create new dataset and sav it to obtain a URI - # dataset = OpenTox::Dataset.create - # # create a new dataset from yaml representation - # dataset = OpenTox::Dataset.from_yaml - # # create a new dataset from CSV string - # csv_string = "SMILES, Toxicity\nc1ccccc1N, true" - # dataset = OpenTox::Dataset.from_csv(csv_string) - # - # # Loading data - # # Datasets created with OpenTox::Dataset.new(uri) are empty by default - # # Invoking one of the following functions will load data into the object - # - # # create an empty dataset with URI - # dataset = OpenTox::Dataset.new("http:://webservices.in-silico/ch/dataset/1") - # # loads (and returns) only metadata - # dataset.load_metadata - # # loads (and returns) only compounds - # dataset.load_compounds - # # loads (and returns) only features - # dataset.load_features - # # load all data from URI - # dataset.load_all - # - # # Getting dataset representations - # - # dataset = OpenTox::Dataset.new("http:://webservices.in-silico/ch/dataset/1") - # dataset.load_all - # # OWL-DL (RDF/XML) - # dataset.rdfxml - # # OWL-DL (Ntriples) - # dataset.ntriples - # # YAML - # dataset.yaml - # # CSV - # dataset.csv - # - # # Modifying datasets - # - # # insert a statement (compound_uri,feature_uri,value) - # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true - # - # - # # Saving datasets - # # save dataset at dataset service - # dataset.save - # - # # Deleting datasets - # # delete dataset (also at dataset service) - # dataset.delete class Dataset - include OtObject + include OpenTox attr_reader :features, :compounds, :data_entries, :metadata - attr_writer :metadata - # Create dataset with optional URI + # Create dataset with optional URI. Does not load data into the dataset - you will need to execute one of the load_* methods to pull data from a service or to insert it from other representations. + # @example Create an empty dataset + # dataset = OpenTox::Dataset.new + # @example Create an empty dataset with URI + # dataset = OpenTox::Dataset.new("http:://webservices.in-silico/ch/dataset/1") + # @param [optional, String] uri Dataset URI + # @return [OpenTox::Dataset] Dataset object def initialize(uri=nil) super uri @features = {} @@ -76,52 +21,79 @@ module OpenTox @data_entries = {} end - # Create and save an empty dataset (assigns URI to dataset) + # Create an empty dataset and save it at the dataset service (assigns URI to dataset) + # @example Create new dataset and save it to obtain a URI + # dataset = OpenTox::Dataset.create + # @param [optional, String] uri Dataset URI + # @return [OpenTox::Dataset] Dataset object def self.create(uri=CONFIG[:services]["opentox-dataset"]) dataset = Dataset.new - dataset.uri = RestClientWrapper.post(uri,{}).to_s.chomp + dataset.save + dataset + end + + # Find a dataset and load all data. This can be time consuming, use Dataset.new together with one of the load_* methods for a fine grained control over data loading. + # @param [String] uri Dataset URI + # @return [OpenTox::Dataset] Dataset object with all data + def self.find(uri) + dataset = Dataset.new(uri) + dataset.load_all dataset end # Get all datasets from a service -# def self.all(uri=CONFIG[:services]["opentox-dataset"]) -# RestClientWrapper.get(uri,:accept => "text/uri-list").to_s.each_line.collect{|u| Dataset.new(u)} -# end + # @param [optional,String] uri URI of the dataset service, defaults to service specified in configuration + # @return [Array] Array of dataset object with all data + def self.all(uri=CONFIG[:services]["opentox-dataset"]) + RestClientWrapper.get(uri,:accept => "text/uri-list").to_s.each_line.collect{|u| Dataset.new(u)} + end - # Create a dataset from YAML string - def self.from_yaml(yaml) - dataset = Dataset.create - dataset.copy YAML.load(yaml) - dataset + # Load YAML representation into the dataset + # @param [String] yaml YAML representation of the dataset + # @return [OpenTox::Dataset] Dataset object with YAML data + def load_yaml(yaml) + copy YAML.load(yaml) + end + + # Load RDF/XML representation from a file + # @param [String] file File with RDF/XML representation of the dataset + # @return [OpenTox::Dataset] Dataset object with RDF/XML data + def load_rdfxml_file(file) + parser = Parser::Owl::Dataset.new @uri + parser.uri = file.path + copy parser.load_uri end - # Create dataset from CSV string (format specification: http://toxcreate.org/help) + # Load CSV string (format specification: http://toxcreate.org/help) # - loads data_entries, compounds, features # - sets metadata (warnings) for parser errors # - you will have to set remaining metadata manually - def self.from_csv(csv) - dataset = Dataset.create - Parser::Spreadsheet.new(dataset).load_csv(csv) - dataset + # @param [String] csv CSV representation of the dataset + # @return [OpenTox::Dataset] Dataset object with CSV data + def load_csv(csv) + save unless @uri # get a uri for creating features + parser = Parser::Spreadsheets.new + parser.dataset = self + parser.load_csv(csv) end - # Create dataset from Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)) + # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)) # - loads data_entries, compounds, features # - sets metadata (warnings) for parser errors # - you will have to set remaining metadata manually - def self.from_spreadsheet(book) - dataset = Dataset.create - Parser::Spreadsheet.new(dataset).load_excel(book) - dataset + # @param [Excel] book Excel workbook object (created with roo gem) + # @return [OpenTox::Dataset] Dataset object with Excel data + def load_spreadsheet(book) + save unless @uri # get a uri for creating features + parser = Parser::Spreadsheets.new + parser.dataset = self + parser.load_excel(book) end - # Load and return metadata of a Dataset object + # Load and return only metadata of a Dataset object + # @return [Hash] Metadata of the dataset def load_metadata - #if (CONFIG[:yaml_hosts].include?(URI.parse(@uri).host)) - #add_metadata YAML.load(RestClientWrapper.get(File.join(@uri,"metadata"), :accept => "application/x-yaml")) - #else - add_metadata Parser::Owl::Dataset.new(@uri).metadata - #end + add_metadata Parser::Owl::Dataset.new(@uri).metadata self.uri = @uri if @uri # keep uri @metadata end @@ -136,7 +108,8 @@ module OpenTox end end - # Load and return all compound URIs + # Load and return only compound URIs from the dataset service + # @return [Array] Compound URIs in the dataset def load_compounds RestClientWrapper.get(File.join(uri,"compounds"),:accept=> "text/uri-list").to_s.each_line do |compound_uri| @compounds << compound_uri.chomp @@ -144,44 +117,75 @@ module OpenTox @compounds.uniq! end - # Load all feature URIs + # Load and return only features from the dataset service + # @return [Hash] Features of the dataset def load_features - RestClientWrapper.get(File.join(uri,"features"),:accept=> "text/uri-list").to_s.each_line do |feature_uri| - @features[feature_uri.chomp] = Feature.new(feature_uri.chomp).load_metadata - end + parser = Parser::Owl::Dataset.new(@uri) + @features = parser.load_features @features end - # Get YAML representation - def yaml - self.to_yaml + # Detect feature type(s) in the dataset + # @return [String] `classification", "regression", "mixed" or unknown` + def feature_type + feature_types = @features.collect{|f,metadata| metadata[OT.isA]}.uniq + LOGGER.debug "FEATURES" + LOGGER.debug feature_types.inspect + if feature_types.size > 1 + "mixed" + else + case feature_types.first + when /NominalFeature/ + "classification" + when /NumericFeature/ + "regression" + else + "unknown" + end + end end - # Get Excel representation, returns a Spreadsheet::Workbook which can be written with the 'spreadsheet' gem (data_entries only, metadata will ) - def excel - Serializer::Spreadsheets.new(self).excel + # Get Excel representation + # @return [Spreadsheet::Workbook] Workbook which can be written with the spreadsheet gem (data_entries only, metadata will will be discarded)) + def to_xls + Serializer::Spreadsheets.new(self).to_xls end # Get CSV string representation (data_entries only, metadata will be discarded) - def csv - Serializer::Spreadsheets.new(self).csv + # @return [String] CSV representation + def to_csv + Serializer::Spreadsheets.new(self).to_csv end # Get OWL-DL in ntriples format - def ntriples + # @return [String] N-Triples representation + def to_ntriples s = Serializer::Owl.new s.add_dataset(self) - s.ntriples + s.to_ntriples end # Get OWL-DL in RDF/XML format - def rdfxml + # @return [String] RDF/XML representation + def to_rdfxml s = Serializer::Owl.new s.add_dataset(self) - s.rdfxml + s.to_rdfxml + end + + # Get name (DC.title) of a feature + # @param [String] feature Feature URI + # @return [String] Feture title + def feature_name(feature) + @features[feature][DC.title] end # Insert a statement (compound_uri,feature_uri,value) + # @example Insert a statement (compound_uri,feature_uri,value) + # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true + # @param [String] compound Compound URI + # @param [String] feature Compound URI + # @param [Boolean,Float] value Feature value def add (compound,feature,value) @compounds << compound unless @compounds.include? compound @features[feature] = {} unless @features[feature] @@ -190,252 +194,62 @@ module OpenTox @data_entries[compound][feature] << value end - # Add metadata (hash with predicate_uri => value) + # Add/modify metadata, existing entries will be overwritten + # @example + # dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"}) + # @param [Hash] metadata Hash mapping predicate_uris to values def add_metadata(metadata) metadata.each { |k,v| @metadata[k] = v } end - # Copy a dataset (rewrites URI) - def copy(dataset) - @metadata = dataset.metadata - @data_entries = dataset.data_entries - @compounds = dataset.compounds - @features = dataset.features - if @uri - self.uri = @uri - else - @uri = dataset.metadata[XSD.anyUri] - end + # Add a feature + # @param [String] feature Feature URI + # @param [Hash] metadata Hash with feature metadata + def add_feature(feature,metadata={}) + @features[feature] = metadata end - # save dataset (overwrites existing dataset) + # Add/modify metadata for a feature + # @param [String] feature Feature URI + # @param [Hash] metadata Hash with feature metadata + def add_feature_metadata(feature,metadata) + metadata.each { |k,v| @features[feature][k] = v } + end + + # Save dataset at the dataset service + # - creates a new dataset if uri is not set + # - overwrites dataset if uri exists + # @return [String] Dataset URI def save # TODO: rewrite feature URI's ?? - # create dataset if uri empty @compounds.uniq! - RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) + if @uri + RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) + else + # create dataset if uri is empty + self.uri = RestClientWrapper.post(CONFIG[:services]["opentox-dataset"],{}).to_s.chomp + RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml) + end + @uri end # Delete dataset at the dataset service def delete RestClientWrapper.delete @uri end - end -end - - ######################################################### - # kept for backward compatibility, may have to be fixed # - ######################################################### - -=begin - def from_owl(owl) - # creates dataset object from Opentox::Owl object - # use Dataset.find( ) to load dataset from rdf-supporting datasetservice - # note: does not load all feature values, as this is time consuming - raise "invalid param" unless owl.is_a?(OpenTox::Owl) - @metadata[DC.title] = owl.get("title") - @metadata[DC.creator] = owl.get("creator") - @metadata[XSD.anyUri] = owl.uri - # when loading a dataset from owl, only compound- and feature-uris are loaded - owl.load_dataset(@compounds, @features) - # all features are marked as dirty - # as soon as a feature-value is requested all values for this feature are loaded from the rdf - @dirty_features = @features.dclone - @owl = owl - end - - def self.find(uri, accept_header=nil) - - unless accept_header - if (CONFIG[:yaml_hosts].include?(URI.parse(uri).host)) - accept_header = 'application/x-yaml' - else - accept_header = "application/rdf+xml" - end - end - - case accept_header - when "application/x-yaml" - LOGGER.debug "DATASET: "+ uri - LOGGER.debug RestClientWrapper.get(uri.to_s.strip, :accept => 'application/x-yaml').to_s - d = YAML.load RestClientWrapper.get(uri.to_s.strip, :accept => 'application/x-yaml').to_s - #d.uri = @metadata[XSD.anyUri] unless d.uri - when "application/rdf+xml" - owl = OpenTox::Owl.from_uri(uri.to_s.strip, "Dataset") - d = Dataset.new(owl) - else - raise "cannot get datset with accept header: "+accept_header.to_s - end - d - end - # converts a dataset represented in owl to yaml - # (uses a temporary dataset) - # note: to_yaml is overwritten, loads complete owl dataset values - def self.owl_to_yaml( owl_data, uri) - owl = OpenTox::Owl.from_data(owl_data, uri, "Dataset") - d = Dataset.new(owl) - d.to_yaml - end - - # creates a new dataset, using only those compounsd specified in new_compounds - # returns uri of new dataset - def create_new_dataset( new_compounds, new_features, new_title, new_creator ) - - LOGGER.debug "create new dataset with "+new_compounds.size.to_s+"/"+compounds.size.to_s+" compounds" - raise "no new compounds selected" unless new_compounds and new_compounds.size>0 - - # load require features - if ((defined? @dirty_features) && (@dirty_features & new_features).size > 0) - (@dirty_features & new_features).each{|f| load_feature_values(f)} - end - - dataset = OpenTox::Dataset.new - dataset.title = new_title - dataset.creator = new_creator - dataset.features = new_features - dataset.compounds = new_compounds - - # Copy dataset data for compounds and features - # PENDING: why storing feature values in an array? - new_compounds.each do |c| - data_c = [] - raise "no data for compound '"+c.to_s+"'" if @data[c]==nil - @data[c].each do |d| - m = {} - new_features.each do |f| - m[f] = d[f] - end - data_c << m - end - dataset.data[c] = data_c - end - return dataset.save - end - - # returns classification value - def get_predicted_class(compound, feature) - v = get_value(compound, feature) - if v.is_a?(Hash) - k = v.keys.grep(/classification/).first - unless k.empty? - #if v.has_key?(:classification) - return v[k] - else - return "no classification key" - end - elsif v.is_a?(Array) - raise "predicted class value is an array\n"+ - "value "+v.to_s+"\n"+ - "value-class "+v.class.to_s+"\n"+ - "dataset "+self.uri.to_s+"\n"+ - "compound "+compound.to_s+"\n"+ - "feature "+feature.to_s+"\n" - else - return v - end - end - - # returns regression value - def get_predicted_regression(compound, feature) - v = get_value(compound, feature) - if v.is_a?(Hash) - k = v.keys.grep(/regression/).first - unless k.empty? - return v[k] - else - return "no regression key" - end - elsif v.is_a?(Array) - raise "predicted regression value is an array\n"+ - "value "+v.to_s+"\n"+ - "value-class "+v.class.to_s+"\n"+ - "dataset "+self.uri.to_s+"\n"+ - "compound "+compound.to_s+"\n"+ - "feature "+feature.to_s+"\n" - else - return v - end - end - - # returns prediction confidence if available - def get_prediction_confidence(compound, feature) - v = get_value(compound, feature) - if v.is_a?(Hash) - k = v.keys.grep(/confidence/).first - unless k.empty? - #if v.has_key?(:confidence) - return v[k].abs - #return v["http://ot-dev.in-silico.ch/model/lazar#confidence"].abs - else - # PENDING: return nil isntead of raising an exception - raise "no confidence key" - end - else - LOGGER.warn "no confidence for compound: "+compound.to_s+", feature: "+feature.to_s - return 1 - end - end - - # return compound-feature value - def get_value(compound, feature) - if (defined? @dirty_features) && @dirty_features.include?(feature) - load_feature_values(feature) - end - - v = @data[compound] - return nil if v == nil # missing values for all features - if v.is_a?(Array) - # PENDING: why using an array here? - v.each do |e| - if e.is_a?(Hash) - if e.has_key?(feature) - return e[feature] - end - else - raise "invalid internal value type" - end - end - return nil #missing value - else - raise "value is not an array\n"+ - "value "+v.to_s+"\n"+ - "value-class "+v.class.to_s+"\n"+ - "dataset "+self.uri.to_s+"\n"+ - "compound "+compound.to_s+"\n"+ - "feature "+feature.to_s+"\n" - end - end - - # loads specified feature and removes dirty-flag, loads all features if feature is nil - def load_feature_values(feature=nil) - if feature - raise "feature already loaded" unless @dirty_features.include?(feature) - @owl.load_dataset_feature_values(@compounds, @data, [feature]) - @dirty_features.delete(feature) + private + # Copy a dataset (rewrites URI) + def copy(dataset) + @metadata = dataset.metadata + @data_entries = dataset.data_entries + @compounds = dataset.compounds + @features = dataset.features + if @uri + self.uri = @uri else - @data = {} unless @data - @owl.load_dataset_feature_values(@compounds, @data, @dirty_features) - @dirty_features.clear + @uri = dataset.metadata[XSD.anyURI] end end - - # overwrite to yaml: - # in case dataset is loaded from owl: - # * load all values - def to_yaml - # loads all features - if ((defined? @dirty_features) && @dirty_features.size > 0) - load_feature_values - end - super - end - - # * remove @owl from yaml, not necessary - def to_yaml_properties - super - ["@owl"] - end - end end -=end -- cgit v1.2.3