From 1c74a684b8b2992d7e6fce5f1a4677d397ec4dd6 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Sat, 18 Jul 2015 08:52:39 +0200
Subject: dataset tests pass

---
 lib/dataset.rb | 334 +++++++++++++++++++++++----------------------------------
 1 file changed, 132 insertions(+), 202 deletions(-)

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 1c938da..bcbacb2 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -4,95 +4,48 @@ module OpenTox

   # Ruby wrapper for OpenTox Dataset Webservices (http://opentox.org/dev/apis/api-1.2/dataset).
   class Dataset
-
-    def initialize uri=nil
-      super uri
-      @data["features"] ||= []
-      @data["compounds"] ||= []
-      @data["data_entries"] ||= []
+    include Mongoid::Document
+
+    field :feature_ids, type: Array
+    field :inchis, type: Array
+    field :data_entries, type: Array
+    field :warnings, type: Array
+    field :source, type: String
+
+    def initialize
+      super
+      self.feature_ids = []
+      self.inchis = []
+      self.data_entries = []
+      self.warnings = []
     end

-    def data_entries
-      @data["data_entries"]
+    # Readers
+
+    def compounds
+      inchis.collect{|i| OpenTox::Compound.new i}
     end

-    [:features, :compounds, :data_entries].each do |method|
-      send :define_method, "#{method}=" do |value|
-        @data[method.to_s] = value.collect{|v| v.uri}
-      end
-      send :define_method, "#{method}<<" do |value|
-        @data[method.to_s] << value.uri
-      end
+    def features
+      self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
     end

-    # Get data (lazy loading from dataset service)
-    # overrides {OpenTox#metadata} to only load the metadata instead of the whole dataset
-    # @return [Hash] the metadata
-    def metadata force_update=false
-      if @data.empty? or force_update
-        uri = File.join(@data["uri"],"metadata")
-        #begin
-          @data = JSON.parse RestClientWrapper.get(uri,{},{:accept => "application/json"})
-          #parse_ntriples RestClientWrapper.get(uri,{},{:accept => "text/plain"})
-        #rescue # fall back to rdfxml
-          #parse_rdfxml RestClientWrapper.get(uri,{},{:accept => "application/rdf+xml"})
-        #end
-        #@data = @rdf.to_hash[RDF::URI.new(@data["uri"])].inject({}) { |h, (predicate, values)| h[predicate] = values.collect{|v| v.to_s}; h }
-      end
-      @data
+    # Writers
+
+    def compounds=(compounds)
+      self.inchis = compounds.collect{|c| c.inchi}
     end

-    # @return [Array] feature objects (NOT uris)
-    def features force_update=false
-      if @data["features"].empty? or force_update
-        uri = File.join(@data["uri"],"features")
-        begin
-          uris = JSON.parse RestClientWrapper.get(uri,{},{:accept => "application/json"}) # ordered datasets return ordered features
-        rescue
-          uris = []
-        end
-        @data["features"] = uris#.collect{|uri| Feature.new(uri)}
-      end
-      @data["features"].collect{|uri| Feature.new uri}
+    def add_compound(compound)
+      self.inchis << compound.inchi
     end

-    # @return [Array] compound objects (NOT uris)
-    def compounds force_update=false
-      if @data["compounds"].empty? or force_update
-        uri = File.join(@data["uri"],"compounds")
-        begin
-          uris = JSON.parse RestClientWrapper.get(uri,{},{:accept => "application/json"}) # ordered datasets return ordered compounds
-        rescue
-          uris = []
-        end
-        @data["compounds"] = uris
-      end
-      @data["compounds"].collect{|uri| Compound.new(uri)}
+    def features=(features)
+      self.feature_ids = features.collect{|f| f.id}
     end

-    # @return [Array] with two dimensions,
-    #   first index: compounds, second index: features, values: compound feature values
-    def data_entries force_update=false
-      if @data["data_entries"].empty? or force_update
-        sparql = "SELECT ?cidx ?fidx ?value FROM <#{uri}> WHERE {
-          ?data_entry <#{RDF::OLO.index}> ?cidx ;
-            <#{RDF::OT.values}> ?v .
-          ?v <#{RDF::OT.feature}> ?f;
-            <#{RDF::OT.value}> ?value .
-          ?f <#{RDF::OLO.index}> ?fidx.
-          } ORDER BY ?fidx ?cidx"
-        RestClientWrapper.get(service_uri,{:query => sparql},{:accept => "text/uri-list"}).split("\n").each do |row|
-          r,c,v = row.split("\t")
-          @data["data_entries"][r.to_i] ||= []
-          # adjust value class depending on feature type, StringFeature takes precedence over NumericFeature
-          if features[c.to_i][RDF.type].include? RDF::OT.NumericFeature and ! features[c.to_i][RDF.type].include? RDF::OT.StringFeature
-            v = v.to_f if v
-          end
-          @data["data_entries"][r.to_i][c.to_i] = v if v
-        end
-        # TODO: fallbacks for external and unordered datasets
-      end
-      @data["data_entries"]
+    def add_feature(feature)
+      self.feature_ids << feature.id
     end

     # Find data entry values for a given compound and feature
@@ -100,8 +53,8 @@ module OpenTox
     # @param feature [OpenTox::Feature] OpenTox Feature object
     # @return [Array] Data entry values
     def values(compound, feature)
-      rows = (0 ... compounds.length).select { |r| compounds[r].uri == compound.uri }
-      col = features.collect{|f| f.uri}.index feature.uri
+      rows = (0 ... inchis.length).select { |r| inchis[r] == compound.inchi }
+      col = feature_ids.index feature.id
       rows.collect{|row| data_entries[row][col]}
     end

@@ -122,7 +75,7 @@ module OpenTox
     end

     # for prediction result datasets
-    # assumes that there are features with title prediction and confidence
+    # assumes that there are features with the titles 'prediction' and 'confidence'
     # @return [Array] of Hashes with keys { :compound, :value, :confidence } (compound value is object not uri)
     def predictions
       predictions = []
@@ -149,18 +102,17 @@ module OpenTox
     end

     # Adding data methods
-    # (Alternatively, you can directly change @data["features"] and @data["compounds"])
+    # (Alternatively, you can directly change feature_ids and inchis)

     # Create a dataset from file (csv,sdf,...)
     # @param filename [String]
     # @return [String] dataset uri
     def upload filename, wait=true
-      uri = RestClientWrapper.put(@data["uri"], {:file => File.new(filename)})
-      wait_for_task uri if URI.task?(uri) and wait
-      compounds true
-      features true
-      metadata true
-      @data["uri"]
+      self.title = File.basename(filename)
+      self.source = filename
+      table = CSV.read filename, :skip_blanks => true
+      from_table table
+      save
     end

     # @param compound [OpenTox::Compound]
@@ -182,6 +134,7 @@ module OpenTox
       end
     end

+    # TODO: remove? might be dangerous if feature ordering is incorrect
     # MG: I would not remove this because add_data_entry is very slow (4 times searching in arrays)
     # CH: do you have measurements? compound and feature arrays are not that big, I suspect that feature search/creation is the time critical step

@@ -193,11 +146,11 @@ module OpenTox
     #   d << [ Compound.new("c1ccccc1"), feature-value-a, feature-value-b ]
     def << row
       compound = row.shift # removes the compound from the array
-      bad_request_error "Dataset features are empty." unless @data["features"]
-      bad_request_error "Row size '#{row.size}' does not match features size '#{@data["features"].size}'." unless row.size == @data["features"].size
+      bad_request_error "Dataset features are empty." unless feature_ids
+      bad_request_error "Row size '#{row.size}' does not match features size '#{feature_ids.size}'." unless row.size == feature_ids.size
       bad_request_error "First column is not an OpenTox::Compound" unless compound.class == OpenTox::Compound
-      @data["compounds"] << compound.uri
-      @data["data_entries"] << row
+      self.inchis << compound.inchi
+      self.data_entries << row
     end

     # Serialisation
@@ -213,107 +166,6 @@ module OpenTox
       end
     end

-=begin
-    RDF_FORMATS.each do |format|
-
-      # redefine rdf parse methods for all formats e.g. parse_rdfxml
-      send :define_method, "parse_#{format}".to_sym do |rdf|
-        # TODO: parse ordered dataset
-        # TODO: parse data entries
-        # TODO: parse metadata
-        @rdf = RDF::Graph.new
-        RDF::Reader.for(format).new(rdf) do |reader|
-          reader.each_statement{ |statement| @rdf << statement }
-        end
-        query = RDF::Query.new({ :uri => { RDF.type => RDF::OT.Compound } })
-        @data["compounds"] = query.execute(@rdf).collect { |solution| OpenTox::Compound.new solution.uri }
-        query = RDF::Query.new({ :uri => { RDF.type => RDF::OT.Feature } })
-        @data["features"] = query.execute(@rdf).collect { |solution| OpenTox::Feature.new solution.uri }
-        @data["compounds"].each_with_index do |c,i|
-          @data["features"].each_with_index do |f,j|
-          end
-        end
-      end
-
-      # redefine rdf serialization methods
-      send :define_method, "to_#{format}".to_sym do
-        @data[RDF.type] = [RDF::OT.Dataset, RDF::OT.OrderedDataset]
-        create_rdf
-        @data["features"].each_with_index do |feature,i|
-          @rdf << [RDF::URI.new(feature.uri), RDF::URI.new(RDF.type), RDF::URI.new(RDF::OT.Feature)]
-          @rdf << [RDF::URI.new(feature.uri), RDF::URI.new(RDF::OLO.index), RDF::Literal.new(i)]
-        end
-        @data["compounds"].each_with_index do |compound,i|
-          @rdf << [RDF::URI.new(compound.uri), RDF::URI.new(RDF.type), RDF::URI.new(RDF::OT.Compound)]
-          if defined? @neighbors and neighbors.include? compound
-            @rdf << [RDF::URI.new(compound.uri), RDF::URI.new(RDF.type), RDF::URI.new(RDF::OT.Neighbor)]
-          end
-
-          @rdf << [RDF::URI.new(compound.uri), RDF::URI.new(RDF::OLO.index), RDF::Literal.new(i)]
-          data_entry_node = RDF::Node.new
-          @rdf << [RDF::URI.new(@data["uri"]), RDF::URI.new(RDF::OT.dataEntry), data_entry_node]
-          @rdf << [data_entry_node, RDF::URI.new(RDF.type), RDF::URI.new(RDF::OT.DataEntry)]
-          @rdf << [data_entry_node, RDF::URI.new(RDF::OLO.index), RDF::Literal.new(i)]
-          @rdf << [data_entry_node, RDF::URI.new(RDF::OT.compound), RDF::URI.new(compound.uri)]
-          @data["data_entries"][i].each_with_index do |value,j|
-            value_node = RDF::Node.new
-            @rdf << [data_entry_node, RDF::URI.new(RDF::OT.values), value_node]
-            @rdf << [value_node, RDF::URI.new(RDF::OT.feature), RDF::URI.new(@data["features"][j].uri)]
-            @rdf << [value_node, RDF::URI.new(RDF::OT.value), RDF::Literal.new(value)]
-          end
-        end
-        RDF::Writer.for(format).buffer do |writer|
-          writer << @rdf
-        end
-      end
-
-    end
-=end
-
-# TODO: fix bug that affects data_entry positions # DG: who wrote this comment?
-    def to_ntriples # redefined string version for better performance
-      ntriples = ""
-      @data[RDF.type] = [ RDF::OT.Dataset, RDF::OT.OrderedDataset ]
-      @data.each do |predicate,values|
-        [values].flatten.each do |value|
-          URI.valid?(value) ? value = "<#{value}>" : value = "\"#{value}\""
value = "<#{value}>" : value = "\"#{value}\"" - ntriples << "<#{@data["uri"]}> <#{predicate}> #{value} .\n" #\n" - end - end - @parameters.each_with_index do |parameter,i| - p_node = "_:parameter"+ i.to_s - ntriples << "<#{@data["uri"]}> <#{RDF::OT.parameters}> #{p_node} .\n" - ntriples << "#{p_node} <#{RDF.type}> <#{RDF::OT.Parameter}> .\n" - parameter.each { |k,v| ntriples << "#{p_node} <#{k}> \"#{v.to_s.tr('"', '\'')}\" .\n" } - end - @data["features"].each_with_index do |feature,i| - ntriples << "<#{feature.uri}> <#{RDF.type}> <#{RDF::OT.Feature}> .\n" - ntriples << "<#{feature.uri}> <#{RDF::OLO.index}> \"#{i}\"^^ .\n" # sorting at dataset service does not work without type information - end - @data["compounds"].each_with_index do |compound,i| - ntriples << "<#{compound.uri}> <#{RDF.type}> <#{RDF::OT.Compound}> .\n" - if defined? @neighbors and neighbors.include? compound - ntriples << "<#{compound.uri}> <#{RDF.type}> <#{RDF::OT.Neighbor}> .\n" - end - - ntriples << "<#{compound.uri}> <#{RDF::OLO.index}> \"#{i}\"^^ .\n" # sorting at dataset service does not work without type information - data_entry_node = "_:dataentry"+ i.to_s - ntriples << "<#{@data["uri"]}> <#{RDF::OT.dataEntry}> #{data_entry_node} .\n" - ntriples << "#{data_entry_node} <#{RDF.type}> <#{RDF::OT.DataEntry}> .\n" - ntriples << "#{data_entry_node} <#{RDF::OLO.index}> \"#{i}\"^^ .\n" # sorting at dataset service does not work without type information - ntriples << "#{data_entry_node} <#{RDF::OT.compound}> <#{compound.uri}> .\n" - @data["data_entries"][i].each_with_index do |value,j| - value_node = data_entry_node+ "_value"+ j.to_s - ntriples << "#{data_entry_node} <#{RDF::OT.values}> #{value_node} .\n" - ntriples << "#{value_node} <#{RDF::OT.feature}> <#{@data["features"][j].uri}> .\n" - ntriples << "#{value_node} <#{RDF::OT.value}> \"#{value}\" .\n" - end unless @data["data_entries"][i].nil? - end - ntriples - - end - # Methods for for validation service # create a new dataset with the specified compounds and features @@ -350,32 +202,32 @@ module OpenTox # @param dataset [OpenTox::Dataset] dataset that should be mapped to this dataset (fully loaded) # @param compound_index [Fixnum], corresponding to dataset def compound_index( dataset, compound_index ) - compound_uri = dataset.compounds[compound_index]#.uri - self_indices = compound_indices(compound_uri) + compound_inchi = dataset.compounds[compound_index].inchi + self_indices = compound_indices(compound_inchi) if self_indices==nil nil else - dataset_indices = dataset.compound_indices(compound_uri) + dataset_indices = dataset.compound_indices(compound_inchi) if self_indices.size==1 self_indices.first elsif self_indices.size==dataset_indices.size # we do assume that the order is preseverd (i.e., the nth occurences in both datasets are mapped to each other)! 
           self_indices[dataset_indices.index(compound_index)]
         else
-          raise "cannot map compound #{compound_uri} from dataset #{dataset.uri} to dataset #{uri}, "+
+          raise "cannot map compound #{compound_inchi} from dataset #{dataset.id} to dataset #{self.id}, "+
             "compound occurs #{dataset_indices.size} times and #{self_indices.size} times"
         end
       end
     end

-    # returns the indices of the compound in the dataset
-    # @param compound_uri [String]
+    # returns the indices of the compound in the dataset
+    # @param compound_inchi [String]
     # @return [Array] compound index (position) of the compound in the dataset, array-size is 1 unless multiple occurrences
-    def compound_indices( compound_uri )
-      unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound_uri)
+    def compound_indices( compound_inchi )
+      unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound_inchi)
         @cmp_indices = {}
         compounds().size.times do |i|
-          c = @data["compounds"][i]#.uri
+          c = self.compounds[i].inchi
           if @cmp_indices[c]==nil
             @cmp_indices[c] = [i]
           else
@@ -383,7 +235,7 @@ module OpenTox
           end
         end
       end
-      @cmp_indices[compound_uri]
+      @cmp_indices[compound_inchi]
     end

     # returns compound feature value using the compound-index and the feature_uri
@@ -392,6 +244,84 @@ module OpenTox
     def compound_feature_value( compound_index, feature_uri )
       col = @data["features"].collect{|f| f.uri}.index feature_uri
       @data["data_entries"][compound_index] ? @data["data_entries"][compound_index][col] : nil
     end
-  end

+    def from_table table
+
+      # features
+      feature_names = table.shift.collect{|f| f.strip}
+      self.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
+      compound_format = feature_names.shift.strip
+      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
+      ignored_feature_indices = []
+      numeric = []
+      feature_names.each_with_index do |f,i|
+        values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+        types = values.collect{|v| v.numeric? ? true : false}.uniq
+        metadata = {"title" => f}
+        if values.size == 0 # empty feature
+        elsif values.size > 5 and types.size == 1 and types.first == true # numeric feature: more than 5 distinct values, all numeric
+          metadata["numeric"] = true
+          numeric[i] = true
+        else
+          metadata["nominal"] = true
+          metadata["string"] = true
+          metadata["accept_values"] = values
+          numeric[i] = false
+        end
+        feature = OpenTox::Feature.find_or_create_by metadata
+        self.feature_ids << feature.id unless feature.nil?
+      end
+
+      # compounds and values
+      r = -1
+      table.each_with_index do |values,j|
+        compound = values.shift
+        begin
+          case compound_format
+          when /SMILES/i
+            c = OpenTox::Compound.from_smiles(compound)
+            if c.inchi.empty?
+              self.warnings << "Cannot parse #{compound_format} compound '#{compound.strip}' at position #{j+2}, all entries are ignored."
+              next
+            else
+              inchi = c.inchi
+            end
+          when /InChI/i
+            # TODO: validate InChI
+            inchi = compound
+          else
+            raise "wrong compound format" # should be checked above
+          end
+        rescue
+          self.warnings << "Cannot parse #{compound_format} compound '#{compound}' at position #{j+2}, all entries are ignored."
+          next
+        end
+
+        r += 1
+        self.inchis << inchi
+        unless values.size == self.feature_ids.size
+          self.warnings << "Number of values at position #{j+2} (#{values.size}) differs from the header size (#{self.feature_ids.size}), all entries are ignored."
+          next
+        end
+
+        self.data_entries << []
+        values.each_with_index do |v,i|
+          if v.blank?
+            self.data_entries.last << nil
+            self.warnings << "Empty value for compound '#{compound}' (row #{r+2}) and feature '#{feature_names[i]}' (column #{i+2})."
+            next
+          elsif numeric[i]
+            self.data_entries.last << v.to_f
+          else
+            self.data_entries.last << v.strip
+          end
+        end
+      end
+      self.inchis.duplicates.each do |inchi|
+        positions = []
+        self.inchis.each_with_index{|c,i| positions << i+1 if !c.blank? and c == inchi}
+        self.warnings << "Duplicate compound #{inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+      end
+    end
+  end
 end
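
A minimal usage sketch of the in-memory API introduced by this commit; it assumes a configured Mongoid/MongoDB connection, and the feature title, SMILES string and value are invented for illustration:

    dataset = OpenTox::Dataset.new
    dataset.features = [OpenTox::Feature.find_or_create_by("title" => "LC50")]
    benzene = OpenTox::Compound.from_smiles "c1ccccc1"
    dataset << [benzene, 2.4]                        # first array element has to be an OpenTox::Compound
    dataset.values(benzene, dataset.features.first)  # => [2.4]
    dataset.save                                     # persists the Mongoid document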
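
The CSV path can be sketched the same way: upload reads the file with CSV.read and hands the table to from_table, which creates (or reuses) one feature per header column, converts the first column into compounds, and collects parser problems in warnings instead of raising. The file name and contents below are hypothetical:

    # training.csv (the first header column selects the compound format):
    #   SMILES,LC50
    #   c1ccccc1,2.4
    #   CC(=O)O,1.1
    dataset = OpenTox::Dataset.new
    dataset.upload "training.csv"
    dataset.compounds.size  # => 2
    dataset.data_entries    # => [["2.4"], ["1.1"]] (only 2 distinct values, so the
                            #    feature is treated as nominal and values stay strings)
    dataset.warnings        # => [] here; would list empty values, unparseable rows, duplicates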