summaryrefslogtreecommitdiff
path: root/lib/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dataset.rb')
-rw-r--r--lib/dataset.rb532
1 files changed, 344 insertions, 188 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 526e0a2..2c47502 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -1,229 +1,385 @@
module OpenTox
+ # Ruby wrapper for OpenTox Dataset Webservices (http://opentox.org/dev/apis/api-1.2/dataset).
class Dataset
- attr_accessor :uri, :title, :creator, :data, :features, :compounds
+ include OpenTox
- def initialize( owl=nil )
- @data = {}
- @features = []
+ attr_reader :features, :compounds, :data_entries, :metadata
+
+ # Create dataset with optional URI. Does not load data into the dataset - you will need to execute one of the load_* methods to pull data from a service or to insert it from other representations.
+ # @example Create an empty dataset
+ # dataset = OpenTox::Dataset.new
+ # @example Create an empty dataset with URI
+ # dataset = OpenTox::Dataset.new("http:://webservices.in-silico/ch/dataset/1")
+ # @param [optional, String] uri Dataset URI
+ # @return [OpenTox::Dataset] Dataset object
+ def initialize(uri=nil,subjectid=nil)
+ super uri
+ @features = {}
@compounds = []
-
- # creates dataset object from Opentox::Owl object
- # use Dataset.find( <uri> ) to load dataset from rdf-supporting datasetservice
- # note: does not load all feature values, as this is time consuming
- if owl
- raise "invalid param" unless owl.is_a?(OpenTox::Owl)
- @title = owl.get("title")
- @creator = owl.get("creator")
- @uri = owl.uri
- # when loading a dataset from owl, only compound- and feature-uris are loaded
- owl.load_dataset(@compounds, @features)
- # all features are marked as dirty
- # as soon as a feature-value is requested all values for this feature are loaded from the rdf
- @dirty_features = @features.dclone
- @owl = owl
- end
+ @data_entries = {}
end
-
- # Find and a dataset and load all data from the dataset service
- # @param [String] uri Dataset URI
+
+ # Create an empty dataset and save it at the dataset service (assigns URI to dataset)
+ # @example Create new dataset and save it to obtain a URI
+ # dataset = OpenTox::Dataset.create
+ # @param [optional, String] uri Dataset URI
# @return [OpenTox::Dataset] Dataset object
- def self.find(uri, accept_header=nil)
-
- unless accept_header
- if (@@config[:yaml_hosts].include?(URI.parse(uri).host))
- accept_header = 'application/x-yaml'
- else
- accept_header = "application/rdf+xml"
- end
- end
-
- case accept_header
- when "application/x-yaml"
- d = YAML.load RestClientWrapper.get(uri.to_s.strip, :accept => 'application/x-yaml').to_s
- d.uri = uri unless d.uri
- when "application/rdf+xml"
- owl = OpenTox::Owl.from_uri(uri.to_s.strip, "Dataset")
- d = Dataset.new(owl)
- else
- raise "cannot get datset with accept header: "+accept_header.to_s
- end
- d
+ def self.create(uri=CONFIG[:services]["opentox-dataset"], subjectid=nil)
+ dataset = Dataset.new(nil,subjectid)
+ dataset.save(subjectid)
+ dataset
+ end
+
+ # Create dataset from CSV file (format specification: http://toxcreate.org/help)
+ # - loads data_entries, compounds, features
+ # - sets metadata (warnings) for parser errors
+ # - you will have to set remaining metadata manually
+ # @param [String] file CSV file path
+ # @return [OpenTox::Dataset] Dataset object with CSV data
+ def self.create_from_csv_file(file, subjectid=nil)
+ dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
+ parser = Parser::Spreadsheets.new
+ parser.dataset = dataset
+ parser.load_csv(File.open(file).read)
+ dataset.save(subjectid)
+ dataset
end
- # converts a dataset represented in owl to yaml
- # (uses a temporary dataset)
- # note: to_yaml is overwritten, loads complete owl dataset values
- def self.owl_to_yaml( owl_data, uri)
- owl = OpenTox::Owl.from_data(owl_data, uri, "Dataset")
- d = Dataset.new(owl)
- d.to_yaml
+ # Find a dataset and load all data. This can be time consuming, use Dataset.new together with one of the load_* methods for a fine grained control over data loading.
+ # @param [String] uri Dataset URI
+ # @return [OpenTox::Dataset] Dataset object with all data
+ def self.find(uri, subjectid=nil)
+ return nil unless uri
+ dataset = Dataset.new(uri, subjectid)
+ dataset.load_all(subjectid)
+ dataset
end
- # creates a new dataset, using only those compounsd specified in new_compounds
- # returns uri of new dataset
- def create_new_dataset( new_compounds, new_features, new_title, new_creator )
-
- LOGGER.debug "create new dataset with "+new_compounds.size.to_s+"/"+compounds.size.to_s+" compounds"
- raise "no new compounds selected" unless new_compounds and new_compounds.size>0
-
- # load require features
- if ((defined? @dirty_features) && (@dirty_features & new_features).size > 0)
- (@dirty_features & new_features).each{|f| load_feature_values(f)}
- end
-
- dataset = OpenTox::Dataset.new
- dataset.title = new_title
- dataset.creator = new_creator
- dataset.features = new_features
- dataset.compounds = new_compounds
-
- # Copy dataset data for compounds and features
- # PENDING: why storing feature values in an array?
- new_compounds.each do |c|
- data_c = []
- raise "no data for compound '"+c.to_s+"'" if @data[c]==nil
- @data[c].each do |d|
- m = {}
- new_features.each do |f|
- m[f] = d[f]
- end
- data_c << m
- end
- dataset.data[c] = data_c
+ # replaces find as exist check, takes not as long, does NOT raise an un-authorized exception
+ # @param [String] uri Dataset URI
+ # @return [Boolean] true if dataset exists and user has get rights, false else
+ def self.exist?(uri, subjectid=nil)
+ return false unless uri
+ dataset = Dataset.new(uri, subjectid)
+ begin
+ dataset.load_metadata( subjectid ).size > 0
+ rescue
+ false
end
- return dataset.save
+ end
+
+ # Get all datasets from a service
+ # @param [optional,String] uri URI of the dataset service, defaults to service specified in configuration
+ # @return [Array] Array of dataset object without data (use one of the load_* methods to pull data from the server)
+ def self.all(uri=CONFIG[:services]["opentox-dataset"], subjectid=nil)
+ RestClientWrapper.get(uri,{:accept => "text/uri-list",:subjectid => subjectid}).to_s.each_line.collect{|u| Dataset.new(u, subjectid)}
+ end
+
+ # Load YAML representation into the dataset
+ # @param [String] yaml YAML representation of the dataset
+ # @return [OpenTox::Dataset] Dataset object with YAML data
+ def load_yaml(yaml)
+ copy YAML.load(yaml)
+ end
+
+ def load_rdfxml(rdfxml)
+ raise "rdfxml data is empty" if rdfxml.to_s.size==0
+ file = Tempfile.new("ot-rdfxml")
+ file.puts rdfxml
+ file.close
+ load_rdfxml_file file
+ file.delete
+ end
+
+ # Load RDF/XML representation from a file
+ # @param [String] file File with RDF/XML representation of the dataset
+ # @return [OpenTox::Dataset] Dataset object with RDF/XML data
+ def load_rdfxml_file(file, subjectid=nil)
+ parser = Parser::Owl::Dataset.new @uri, subjectid
+ parser.uri = file.path
+ copy parser.load_uri(subjectid)
+ end
+
+ # Load CSV string (format specification: http://toxcreate.org/help)
+ # - loads data_entries, compounds, features
+ # - sets metadata (warnings) for parser errors
+ # - you will have to set remaining metadata manually
+ # @param [String] csv CSV representation of the dataset
+ # @return [OpenTox::Dataset] Dataset object with CSV data
+ def load_csv(csv, subjectid=nil)
+ save(subjectid) unless @uri # get a uri for creating features
+ parser = Parser::Spreadsheets.new
+ parser.dataset = self
+ parser.load_csv(csv)
+ end
+
+ # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
+ # - loads data_entries, compounds, features
+ # - sets metadata (warnings) for parser errors
+ # - you will have to set remaining metadata manually
+ # @param [Excel] book Excel workbook object (created with roo gem)
+ # @return [OpenTox::Dataset] Dataset object with Excel data
+ def load_spreadsheet(book, subjectid=nil)
+ save(subjectid) unless @uri # get a uri for creating features
+ parser = Parser::Spreadsheets.new
+ parser.dataset = self
+ parser.load_spreadsheet(book)
end
- # returns classification value
- def get_predicted_class(compound, feature)
- v = get_value(compound, feature)
- if v.is_a?(Hash)
- k = v.keys.grep(/classification/).first
- unless k.empty?
- #if v.has_key?(:classification)
- return v[k]
- else
- return "no classification key"
- end
- elsif v.is_a?(Array)
- raise "predicted class value is an array\n"+
- "value "+v.to_s+"\n"+
- "value-class "+v.class.to_s+"\n"+
- "dataset "+@uri.to_s+"\n"+
- "compound "+compound.to_s+"\n"+
- "feature "+feature.to_s+"\n"
+ # Load and return only metadata of a Dataset object
+ # @return [Hash] Metadata of the dataset
+ def load_metadata(subjectid=nil)
+ add_metadata Parser::Owl::Dataset.new(@uri, subjectid).load_metadata(subjectid)
+ self.uri = @uri if @uri # keep uri
+ @metadata
+ end
+
+ # Load all data (metadata, data_entries, compounds and features) from URI
+ def load_all(subjectid=nil)
+ if (CONFIG[:yaml_hosts].include?(URI.parse(@uri).host))
+ copy YAML.load(RestClientWrapper.get(@uri, {:accept => "application/x-yaml", :subjectid => subjectid}))
else
- return v
+ parser = Parser::Owl::Dataset.new(@uri, subjectid)
+ copy parser.load_uri(subjectid)
end
end
-
- # returns regression value
- def get_predicted_regression(compound, feature)
- v = get_value(compound, feature)
- if v.is_a?(Hash)
- k = v.keys.grep(/regression/).first
- unless k.empty?
- return v[k]
- else
- return "no regression key"
- end
- elsif v.is_a?(Array)
- raise "predicted regression value is an array\n"+
- "value "+v.to_s+"\n"+
- "value-class "+v.class.to_s+"\n"+
- "dataset "+@uri.to_s+"\n"+
- "compound "+compound.to_s+"\n"+
- "feature "+feature.to_s+"\n"
- else
- return v
+
+ # Load and return only compound URIs from the dataset service
+ # @return [Array] Compound URIs in the dataset
+ def load_compounds(subjectid=nil)
+ RestClientWrapper.get(File.join(uri,"compounds"),{:accept=> "text/uri-list", :subjectid => subjectid}).to_s.each_line do |compound_uri|
+ @compounds << compound_uri.chomp
end
+ @compounds.uniq!
end
-
- # returns prediction confidence if available
- def get_prediction_confidence(compound, feature)
- v = get_value(compound, feature)
- if v.is_a?(Hash)
- k = v.keys.grep(/confidence/).first
- unless k.empty?
- #if v.has_key?(:confidence)
- return v[k].abs
- #return v["http://ot-dev.in-silico.ch/model/lazar#confidence"].abs
+
+ # Load and return only features from the dataset service
+ # @return [Hash] Features of the dataset
+ def load_features(subjectid=nil)
+ parser = Parser::Owl::Dataset.new(@uri, subjectid)
+ @features = parser.load_features(subjectid)
+ @features
+ end
+
+ # Detect feature type(s) in the dataset
+ # @return [String] `classification", "regression", "mixed" or unknown`
+ def feature_type
+ feature_types = @features.collect{|f,metadata| metadata[OT.isA]}.uniq
+ if feature_types.size > 1
+ "mixed"
+ else
+ case feature_types.first
+ when /NominalFeature/
+ "classification"
+ when /NumericFeature/
+ "regression"
else
- # PENDING: return nil isntead of raising an exception
- raise "no confidence key"
+ "unknown"
end
- else
- LOGGER.warn "no confidence for compound: "+compound.to_s+", feature: "+feature.to_s
- return 1
end
end
+
+ # Get Spreadsheet representation
+ # @return [Spreadsheet::Workbook] Workbook which can be written with the spreadsheet gem (data_entries only, metadata will will be discarded))
+ def to_spreadsheet
+ Serializer::Spreadsheets.new(self).to_spreadsheet
+ end
+
+ # Get Excel representation (alias for to_spreadsheet)
+ # @return [Spreadsheet::Workbook] Workbook which can be written with the spreadsheet gem (data_entries only, metadata will will be discarded))
+ def to_xls
+ to_spreadsheet
+ end
+
+ # Get CSV string representation (data_entries only, metadata will be discarded)
+ # @return [String] CSV representation
+ def to_csv
+ Serializer::Spreadsheets.new(self).to_csv
+ end
+
+ # Get OWL-DL in ntriples format
+ # @return [String] N-Triples representation
+ def to_ntriples
+ s = Serializer::Owl.new
+ s.add_dataset(self)
+ s.to_ntriples
+ end
+
+ # Get OWL-DL in RDF/XML format
+ # @return [String] RDF/XML representation
+ def to_rdfxml
+ s = Serializer::Owl.new
+ s.add_dataset(self)
+ s.to_rdfxml
+ end
+
+ # Get name (DC.title) of a feature
+ # @param [String] feature Feature URI
+ # @return [String] Feture title
+ def feature_name(feature)
+ @features[feature][DC.title]
+ end
+
+ def title
+ @metadata[DC.title]
+ end
+
+ # Insert a statement (compound_uri,feature_uri,value)
+ # @example Insert a statement (compound_uri,feature_uri,value)
+ # dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true
+ # @param [String] compound Compound URI
+ # @param [String] feature Compound URI
+ # @param [Boolean,Float] value Feature value
+ def add (compound,feature,value)
+ @compounds << compound unless @compounds.include? compound
+ @features[feature] = {} unless @features[feature]
+ @data_entries[compound] = {} unless @data_entries[compound]
+ @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
+ @data_entries[compound][feature] << value if value!=nil
+ end
+
+ # Add/modify metadata, existing entries will be overwritten
+ # @example
+ # dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"})
+ # @param [Hash] metadata Hash mapping predicate_uris to values
+ def add_metadata(metadata)
+ metadata.each { |k,v| @metadata[k] = v }
+ end
+
+ # Add a feature
+ # @param [String] feature Feature URI
+ # @param [Hash] metadata Hash with feature metadata
+ def add_feature(feature,metadata={})
+ @features[feature] = metadata
+ end
+
+ # Add/modify metadata for a feature
+ # @param [String] feature Feature URI
+ # @param [Hash] metadata Hash with feature metadata
+ def add_feature_metadata(feature,metadata)
+ metadata.each { |k,v| @features[feature][k] = v }
+ end
- # return compound-feature value
- def get_value(compound, feature)
- if (defined? @dirty_features) && @dirty_features.include?(feature)
- load_feature_values(feature)
- end
-
- v = @data[compound]
- return nil if v == nil # missing values for all features
- if v.is_a?(Array)
- # PENDING: why using an array here?
- v.each do |e|
- if e.is_a?(Hash)
- if e.has_key?(feature)
- return e[feature]
+ # Add a new compound
+ # @param [String] compound Compound URI
+ def add_compound (compound)
+ @compounds << compound unless @compounds.include? compound
+ end
+
+ # Creates a new dataset, by splitting the current dataset, i.e. using only a subset of compounds and features
+ # @param [Array] compounds List of compound URIs
+ # @param [Array] features List of feature URIs
+ # @param [Hash] metadata Hash containing the metadata for the new dataset
+ # @param [String] subjectid
+ # @return [OpenTox::Dataset] newly created dataset, already saved
+ def split( compounds, features, metadata, subjectid=nil)
+ LOGGER.debug "split dataset using "+compounds.size.to_s+"/"+@compounds.size.to_s+" compounds"
+ raise "no new compounds selected" unless compounds and compounds.size>0
+ dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+ if features.size==0
+ compounds.each{ |c| dataset.add_compound(c) }
+ else
+ compounds.each do |c|
+ features.each do |f|
+ unless @data_entries[c][f]
+ dataset.add(c,f,nil)
+ else
+ @data_entries[c][f].each do |v|
+ dataset.add(c,f,v)
+ end
end
- else
- raise "invalid internal value type"
end
end
- return nil #missing value
- else
- raise "value is not an array\n"+
- "value "+v.to_s+"\n"+
- "value-class "+v.class.to_s+"\n"+
- "dataset "+@uri.to_s+"\n"+
- "compound "+compound.to_s+"\n"+
- "feature "+feature.to_s+"\n"
end
+ dataset.add_metadata(metadata)
+ dataset.save(subjectid)
+ dataset
end
- # loads specified feature and removes dirty-flag, loads all features if feature is nil
- def load_feature_values(feature=nil)
- if feature
- raise "feature already loaded" unless @dirty_features.include?(feature)
- @owl.load_dataset_feature_values(@compounds, @data, [feature])
- @dirty_features.delete(feature)
+ # Save dataset at the dataset service
+ # - creates a new dataset if uri is not set
+ # - overwrites dataset if uri exists
+ # @return [String] Dataset URI
+ def save(subjectid=nil)
+ # TODO: rewrite feature URI's ??
+ @compounds.uniq!
+ if @uri
+ if (CONFIG[:yaml_hosts].include?(URI.parse(@uri).host))
+ RestClientWrapper.post(@uri,self.to_yaml,{:content_type => "application/x-yaml", :subjectid => subjectid})
+ else
+ File.open("ot-post-file.rdf","w+") { |f| f.write(self.to_rdfxml); @path = f.path }
+ task_uri = RestClient.post(@uri, {:file => File.new(@path)},{:accept => "text/uri-list" , :subjectid => subjectid}).to_s.chomp
+ #task_uri = `curl -X POST -H "Accept:text/uri-list" -F "file=@#{@path};type=application/rdf+xml" http://apps.ideaconsult.net:8080/ambit2/dataset`
+ Task.find(task_uri).wait_for_completion
+ self.uri = RestClientWrapper.get(task_uri,{:accept => 'text/uri-list', :subjectid => subjectid})
+ end
else
- @data = {} unless @data
- @owl.load_dataset_feature_values(@compounds, @data, @dirty_features)
- @dirty_features.clear
+ # create dataset if uri is empty
+ self.uri = RestClientWrapper.post(CONFIG[:services]["opentox-dataset"],{:subjectid => subjectid}).to_s.chomp
end
+ @uri
end
-
- # overwrite to yaml:
- # in case dataset is loaded from owl:
- # * load all values
- def to_yaml
- # loads all features
- if ((defined? @dirty_features) && @dirty_features.size > 0)
- load_feature_values
+
+ # Delete dataset at the dataset service
+ def delete(subjectid=nil)
+ RestClientWrapper.delete(@uri, :subjectid => subjectid)
+ end
+
+ private
+ # Copy a dataset (rewrites URI)
+ def copy(dataset)
+ @metadata = dataset.metadata
+ @data_entries = dataset.data_entries
+ @compounds = dataset.compounds
+ @features = dataset.features
+ if @uri
+ self.uri = @uri
+ else
+ @uri = dataset.metadata[XSD.anyURI]
end
- super
end
-
- # * remove @owl from yaml, not necessary
- def to_yaml_properties
- super - ["@owl"]
+ end
+
+ # Class with special methods for lazar prediction datasets
+ class LazarPrediction < Dataset
+
+ # Find a prediction dataset and load all data.
+ # @param [String] uri Prediction dataset URI
+ # @return [OpenTox::Dataset] Prediction dataset object with all data
+ def self.find(uri, subjectid=nil)
+ prediction = LazarPrediction.new(uri, subjectid)
+ prediction.load_all(subjectid)
+ prediction
+ end
+
+ def value(compound)
+ @data_entries[compound.uri].collect{|f,v| v.first if f.match(/prediction/)}.compact.first
end
- # saves (changes) as new dataset in dataset service
- # returns uri
- # uses to yaml method (which is overwritten)
- def save
- OpenTox::RestClientWrapper.post(@@config[:services]["opentox-dataset"],{:content_type => "application/x-yaml"},self.to_yaml).strip
+ def confidence(compound)
+ feature_uri = @data_entries[compound.uri].collect{|f,v| f if f.match(/prediction/)}.compact.first
+ @features[feature_uri][OT.confidence]
end
+
+ def descriptors(compound)
+ @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/descriptor/)}.compact if @data_entries[compound.uri]
+ end
+
+ def measured_activities(compound)
+ source = @metadata[OT.hasSource]
+ @data_entries[compound.uri].collect{|f,v| v if f.match(/#{source}/)}.compact.flatten
+ end
+
+ def neighbors(compound)
+ @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact
+ end
+
+# def errors(compound)
+# features = @data_entries[compound.uri].keys
+# features.collect{|f| @features[f][OT.error]}.join(" ") if features
+# end
+
end
end