From cd1cba67830505cd2d23ec83e64c0beed42a9f28 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 22 Jul 2015 20:08:12 +0200 Subject: mongo batch import workaround --- lib/compound.rb | 86 ++++++---- lib/data_entry.rb | 36 +++++ lib/dataset.rb | 271 +++++++++++++------------------ lib/feature.rb | 86 +++++----- lib/format-conversion.rb | 406 +++++++++++++++++++++++++++++++++++++++++++++++ lib/opentox-client.rb | 44 ++--- lib/opentox.rb | 24 +-- lib/overwrite.rb | 142 +++-------------- 8 files changed, 687 insertions(+), 408 deletions(-) create mode 100644 lib/data_entry.rb create mode 100644 lib/format-conversion.rb diff --git a/lib/compound.rb b/lib/compound.rb index 6cc4707..4a8089b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,16 +1,20 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" require 'openbabel' +require "base64" module OpenTox - # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure). class Compound - attr_reader :inchi - - def initialize inchi - @inchi = inchi - end + field :inchi, type: String + attr_readonly :inchi + field :smiles, type: String + field :inchikey, type: String + field :names, type: Array + field :cid, type: String + field :chemblid, type: String + field :image_id, type: BSON::ObjectId + field :sdf_id, type: BSON::ObjectId def == compound self.inchi == compound.inchi @@ -22,21 +26,23 @@ module OpenTox # @param [String] smiles Smiles string # @return [OpenTox::Compound] Compound def self.from_smiles smiles - OpenTox::Compound.new obconversion(smiles,"smi","inchi") + # do not store smiles because it might be noncanonical + Compound.find_or_create_by :inchi => obconversion(smiles,"smi","inchi") end # Create a compound from inchi string # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - OpenTox::Compound.new inchi + Compound.find_or_create_by :inchi => inchi end # Create a compound from sdf string # @param sdf [String] smiles SDF string # @return [OpenTox::Compound] Compound def self.from_sdf sdf - OpenTox::Compound.new obconversion(sdf,"sdf","inchi") + # do not store sdf because it might be 2D + Compound.find_or_create_by :inchi => obconversion(sdf,"sdf","inchi") end # Create a compound from name. Relies on an external service for name lookups. @@ -45,31 +51,32 @@ module OpenTox # @param name [String] can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] Compound def self.from_name name - OpenTox::Compound.new RestClientWrapper.get File.join(CACTUS_URI,URI.escape(name),"stdinchi") + Compound.find_or_create_by :inchi => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"stdinchi")) end # Get InChIKey # @return [String] InChI string def inchikey - obconversion(@inchi,"inchi","inchikey") + update(:inchikey => obconversion(inchi,"inchi","inchikey")) unless self["inchikey"] + self["inchikey"] end # Get (canonical) smiles # @return [String] Smiles string def smiles - obconversion(@inchi,"inchi","smi") # "can" gives nonn-canonical smiles?? + update(:smiles => obconversion(inchi,"inchi","smi")) unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results + self["smiles"] end # Get sdf # @return [String] SDF string def sdf - obconversion(@inchi,"inchi","sdf") - end - - # Get gif image - # @return [image/gif] Image data - def gif - RestClientWrapper.get File.join(CACTUS_URI,inchi,"image") + if sdf_id.nil? + sdf = obconversion(inchi,"inchi","sdf") + file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") + sdf_id = $gridfs.insert_one file + end + $gridfs.find_one(_id: sdf_id).data end # Get png image @@ -77,41 +84,37 @@ module OpenTox # image = compound.png # @return [image/png] Image data def png - obconversion(@inchi,"inchi","_png2") - end + if image_id.nil? + png = obconversion(inchi,"inchi","_png2") + file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") + update(:image_id => $gridfs.insert_one(file)) + end + Base64.decode64($gridfs.find_one(_id: image_id).data) -=begin - # Get URI of compound image - # @return [String] Compound image URI - def image_uri - File.join @data["uri"], "image" end -=end # Get all known compound names. Relies on an external service for name lookups. # @example # names = compound.names # @return [String] Compound names def names - RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n") + update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + self["names"] end # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem def cid pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" - @cid ||= RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip - end - - # @todo - def chebi - internal_server_error "not yet implemented" + update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + self["cid"] end # @return [String] ChEMBL database compound id, derieved via restcall to chembl def chemblid # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" - @chemblid = JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"] + update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] + self["chemblid"] end private @@ -125,6 +128,19 @@ module OpenTox case output_format when /smi|can|inchi/ obconversion.write_string(obmol).gsub(/\s/,'').chomp + when /sdf/ + OpenBabel::OBOp.find_type("Gen3D").do(obmol) + sdf = obconversion.write_string(obmol) + if sdf.match(/.nan/) + $logger.warn "3D generation failed for compound #{compound.inchi}, trying to calculate 2D structure" + OpenBabel::OBOp.find_type("Gen2D").do(obmol) + sdf = obconversion.write_string(obmol) + if sdf.match(/.nan/) + $logger.warn "2D generation failed for compound #{compound.inchi}" + sdf = nil + end + end + sdf else obconversion.write_string(obmol) end diff --git a/lib/data_entry.rb b/lib/data_entry.rb new file mode 100644 index 0000000..9f6e786 --- /dev/null +++ b/lib/data_entry.rb @@ -0,0 +1,36 @@ +module OpenTox + + class DataEntry + #field :feature_id, type: BSON::ObjectId + #field :compound_id, type: BSON::ObjectId + # Kludge because csv import removes type information + field :feature_id, type: String + field :compound_id, type: String + field :value + field :warnings, type: String + field :unit, type: String + store_in collection: "data_entries" + + # preferred method for the insertion of data entries + # @example DataEntry.find_or_create compound,feature,value + # @param compound [OpenTox::Compound] + # @param feature [OpenTox::Feature] + # @param value + def self.find_or_create compound, feature, value + self.find_or_create_by( + :compound_id => compound.id, + :feature_id => feature.id, + :value => value + ) + end + + # preferred method for accessing values + # @example DataEntry[compound,feature] + # @param compound [OpenTox::Compound] + # @param feature [OpenTox::Feature] + # @return value + def self.[](compound,feature) + self.where(:compound_id => compound.id.to_s, :feature_id => feature.id.to_s).distinct(:value).first + end + end +end diff --git a/lib/dataset.rb b/lib/dataset.rb index 5e9da44..503e409 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,34 +1,20 @@ require 'csv' +require 'tempfile' module OpenTox - class MeasuredDataset < Dataset - end - - class CalculatedDataset < Dataset - field :creator, type: String - end - - class LazarPredictionDataset < CalculatedDataset - field :dependent_variables, type: BSON::ObjectId - field :predicted_variables, type: Array - end - - # provides a basic datastructure similar to R dataframes - # descriptor/feature c class Dataset include Mongoid::Document field :feature_ids, type: Array, default: [] - field :inchis, type: Array, default: [] - field :data_entries, type: Array, default: [] + field :compound_ids, type: Array, default: [] field :source, type: String field :warnings, type: Array, default: [] # Readers def compounds - inchis.collect{|i| OpenTox::Compound.new i} + self.compound_ids.collect{|id| OpenTox::Compound.find id} end def features @@ -38,132 +24,34 @@ module OpenTox # Writers def compounds=(compounds) - self.inchis = compounds.collect{|c| c.inchi} + self.compound_ids = compounds.collect{|c| c.id} end - def add_compound(compound) - self.inchis << compound.inchi + def add_compound compound + self.compound_ids << compound.id end def features=(features) self.feature_ids = features.collect{|f| f.id} end - def add_feature(feature) + def add_feature feature self.feature_ids << feature.id end - # Find data entry values for a given compound and feature - # @param compound [OpenTox::Compound] OpenTox Compound object - # @param feature [OpenTox::Feature] OpenTox Feature object - # @return [Array] Data entry values - def values(compound, feature) - rows = (0 ... inchis.length).select { |r| inchis[r] == compound.inchi } - col = feature_ids.index feature.id - rows.collect{|row| data_entries[row][col]} - end - - # Convenience methods to search by compound/feature URIs - - # Search a dataset for a feature given its URI - # @param uri [String] Feature URI - # @return [OpenTox::Feature] Feature object, or nil if not present - #def find_feature_uri(uri) - #features.select{|f| f.uri == uri}.first - #end - - # Search a dataset for a compound given its URI - # @param uri [String] Compound URI - # @return [OpenTox::Compound] Compound object, or nil if not present - def find_compound_uri(uri) - compounds.select{|f| f.uri == uri}.first + def self.create compounds, features, warnings=[], source=nil + dataset = Dataset.new(:warnings => warnings) + dataset.compounds = compounds + dataset.features = features + dataset end # for prediction result datasets # assumes that there are feature_ids with title prediction and confidence # @return [Array] of Hashes with keys { :compound, :value ,:confidence } (compound value is object not uri) - def predictions - predictions = [] - prediction_feature = nil - confidence_feature = nil - metadata[RDF::OT.predictedVariables].each do |uri| - feature = OpenTox::Feature.new uri - case feature.title - when /prediction$/ - prediction_feature = feature - when /confidence$/ - confidence_feature = feature - end - end - if prediction_feature and confidence_feature - compounds.each do |compound| - value = values(compound,prediction_feature).first - value = value.to_f if prediction_feature[RDF.type].include? RDF::OT.NumericFeature and ! prediction_feature[RDF.type].include? RDF::OT.StringFeature - confidence = values(compound,confidence_feature).first.to_f - predictions << {:compound => compound, :value => value, :confidence => confidence} if value and confidence - end - end - predictions - end - - # Adding data methods - # (Alternatively, you can directly change @data["feature_ids"] and @data["compounds"]) - - # Create a dataset from file (csv,sdf,...) - # @param filename [String] - # @return [String] dataset uri - def upload filename, wait=true - self.title = File.basename(filename) - self.source = filename - table = CSV.read filename, :skip_blanks => true - from_table table - save - end - - def self.from_csv_file file - dataset = self.new - dataset.upload file - dataset - end - - # @param compound [OpenTox::Compound] - # @param feature [OpenTox::Feature] - # @param value [Object] (will be converted to String) - # @return [Array] data_entries - def add_data_entry compound, feature, value - # TODO: optimize - add_compound compound unless self.compounds.include?(compound) - row = self.compounds.index(compound) - add_feature feature unless self.features.include?(feature) - col = self.features.index(feature) - if self.data_entries[row] and self.data_entries[row][col] # duplicated values - add_compound compound - row = self.compounds.rindex(compound) - end - if value - self.data_entries[row] ||= [] - self.data_entries[row][col] = value - end - end - - - # TODO: remove? might be dangerous if feature ordering is incorrect - # MG: I would not remove this because add_data_entry is very slow (4 times searching in arrays) - # CH: do you have measurements? compound and feature arrays are not that big, I suspect that feature search/creation is the time critical step - # @param row [Array] - # @example - # d = Dataset.new - # d.features << Feature.new(a) - # d.features << Feature.new(b) - # d << [ Compound.new("c1ccccc1"), feature-value-a, feature-value-b ] - def << row - compound = row.shift # removes the compound from the array - bad_request_error "Dataset features are empty." unless feature_ids - bad_request_error "Row size '#{row.size}' does not match features size '#{feature_ids.size}'." unless row.size == feature_ids.size - bad_request_error "First column is not a OpenTox::Compound" unless compound.class == OpenTox::Compound - self.inchis << compound.inchi - self.data_entries << row - end + # TODO + #def predictions + #end # Serialisation @@ -185,6 +73,7 @@ module OpenTox # @param feats [Array] features objects # @param metadata [Hash] # @return [OpenTox::Dataset] + # TODO def split( compound_indices, feats, metadata) bad_request_error "Dataset.split : Please give compounds as indices" if compound_indices.size==0 or !compound_indices[0].is_a?(Fixnum) @@ -213,6 +102,7 @@ module OpenTox # ** number of occurences is not equal in both datasets? cannot map, raise error # @param dataset [OpenTox::Dataset] dataset that should be mapped to this dataset (fully loaded) # @param compound_index [Fixnum], corresponding to dataset + # TODO def compound_index( dataset, compound_index ) compound_inchi = dataset.compounds[compound_index].inchi self_indices = compound_indices(compound_inchi) @@ -235,6 +125,7 @@ module OpenTox # returns the inidices of the compound in the dataset # @param compound_inchi [String] # @return [Array] compound index (position) of the compound in the dataset, array-size is 1 unless multiple occurences + # TODO def compound_indices( compound_inchi ) unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound_inchi) @cmp_indices = {} @@ -250,90 +141,142 @@ module OpenTox @cmp_indices[compound_inchi] end - # returns compound feature value using the compound-index and the feature_uri - def data_entry_value(compound_index, feature_uri) - data_entries(true) if @data["data_entries"].empty? - col = @data["features"].collect{|f| f.uri}.index feature_uri - @data["data_entries"][compound_index] ? @data["data_entries"][compound_index][col] : nil + # Adding data methods + # (Alternatively, you can directly change @data["feature_ids"] and @data["compounds"]) + + # Create a dataset from file (csv,sdf,...) + # @param filename [String] + # @return [String] dataset uri + # TODO + #def self.from_sdf_file + #end + + def self.from_csv_file file, source=nil, bioassay=true + source ||= file + table = CSV.read file, :skip_blanks => true + from_table table, source, bioassay end - def from_table table + # parse data in tabular format (e.g. from csv) + # does a lot of guesswork in order to determine feature types + def self.from_table table, source, bioassay=true + + time = Time.now # features feature_names = table.shift.collect{|f| f.strip} - self.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size + dataset = Dataset.new(:source => source) + dataset.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - ignored_feature_indices = [] + numeric = [] + # guess feature types feature_names.each_with_index do |f,i| values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq - metadata = {"title" => f} + metadata = {"name" => f, "source" => source} if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes metadata["numeric"] = true numeric[i] = true else metadata["nominal"] = true - metadata["string"] = true metadata["accept_values"] = values numeric[i] = false end - feature = OpenTox::Feature.find_or_create_by metadata - self.feature_ids << feature.id unless feature.nil? + if bioassay + if metadata["numeric"] + feature = NumericBioAssay.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalBioAssay.find_or_create_by(metadata) + end + else + metadata.merge({:measured => false, :calculated => true}) + if metadata["numeric"] + feature = NumericFeature.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalFeature.find_or_create_by(metadata) + end + end + dataset.feature_ids << OpenTox::Feature.find_or_create_by(metadata).id end + feature_ids = dataset.features.collect{|f| f.id.to_s} + + $logger.debug "Feature values: #{Time.now-time}" + time = Time.now # compounds and values r = -1 - table.each_with_index do |values,j| - compound = values.shift + csv = ["compound_id,feature_id,value"] + + compound_time = 0 + value_time = 0 + + table.each_with_index do |vals,j| + ct = Time.now + identifier = vals.shift begin case compound_format when /SMILES/i - c = OpenTox::Compound.from_smiles(compound) - if c.inchi.empty? - self.warnings << "Cannot parse #{compound_format} compound '#{compound.strip}' at position #{j+2}, all entries are ignored." + compound = OpenTox::Compound.from_smiles(identifier) + if compound.inchi.empty? + dataset.warnings << "Cannot parse #{compound_format} compound '#{compound.strip}' at position #{j+2}, all entries are ignored." next - else - inchi = c.inchi end when /InChI/i - # TODO validate inchi - inchi = compound - else - raise "wrong compound format" #should be checked above + compound = OpenTox::Compound.from_inchi(identifier) end rescue - self.warnings << "Cannot parse #{compound_format} compound '#{compound}' at position #{j+2}, all entries are ignored." + dataset.warnings << "Cannot parse #{compound_format} compound '#{compound}' at position #{j+2}, all entries are ignored." next end - + compound_time += Time.now-ct + dataset.compound_ids << compound.id + r += 1 - self.inchis << inchi - unless values.size == self.feature_ids.size - self.warnings << "Number of values at position #{j+2} (#{values.size}) is different than header size (#{self.feature_ids.size}), all entries are ignored." + unless vals.size == feature_ids.size # way cheaper than accessing dataset.features + dataset.warnings << "Number of values at position #{j+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end - self.data_entries << [] - values.each_with_index do |v,i| + cid = compound.id.to_s + vals.each_with_index do |v,i| if v.blank? - self.data_entries.last << nil - self.warnings << "Empty value for compound '#{compound}' (row #{r+2}) and feature '#{feature_names[i]}' (column #{i+2})." + dataset.warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[i]}' (column #{i+2})." next elsif numeric[i] - self.data_entries.last << v.to_f + csv << "#{cid},#{feature_ids[i]},#{v.to_f}" # retrieving ids from dataset.{compounds|features} kills performance else - self.data_entries.last << v.strip + csv << "#{cid},#{feature_ids[i]},#{v.strip}" # retrieving ids from dataset.{compounds|features} kills performance end end end - self.inchis.duplicates.each do |inchi| + dataset.compounds.duplicates.each do |duplicates| + # TODO fix and check positions = [] - self.inchis.each_with_index{|c,i| positions << i+1 if !c.blank? and c == inchi} - self.warnings << "Duplicate compound #{inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c == compound} + dataset.warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end + + $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" + time = Time.now + + # Workaround for mongo bulk insertions (insertion of single data_entries is far too slow) + # Skip ruby JSON serialisation: + # - to_json is too slow to write to file + # - json (or bson) serialisation is probably causing very long parse times of Mongo::BulkWrite, or any other ruby insert operation + f = Tempfile.new("#{dataset.id.to_s}.csv","/tmp") + f.puts csv.join("\n") + f.close + $logger.debug "Write file: #{Time.now-time}" + time = Time.now + # TODO DB name from config + `mongoimport --db opentox --collection data_entries --type csv --headerline --file #{f.path}` + $logger.debug "Bulk insert: #{Time.now-time}" + time = Time.now + + dataset end end end diff --git a/lib/feature.rb b/lib/feature.rb index b2ddce4..b2f9a93 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -1,48 +1,62 @@ module OpenTox - # TODO subclass features class Feature + field :name, as: :title, type: String + field :nominal, type: Boolean + field :numeric, type: Boolean + field :measured, type: Boolean + field :calculated, type: Boolean + field :supervised, type: Boolean + field :source, as: :title, type: String + end - field :string, type: Boolean, default: false - field :nominal, type: Boolean, default: false - field :numeric, type: Boolean, default: false - field :substructure, type: Boolean, default: false - field :prediction, type: Boolean - field :smarts, type: String - field :pValue, type: Float - field :effect, type: String + class NominalFeature < Feature field :accept_values, type: Array - - # Find out feature type - # Classification takes precedence - # @return [String] Feature type - def feature_type - if nominal - "classification" - elsif numeric - "regression" - else - "unknown" - end + def initialize params + super params + nominal = true end + end - # Get accept values - # - # @return[Array] Accept values - #def accept_values - #self[RDF::OT.acceptValue] ? self[RDF::OT.acceptValue].sort : nil - #end - - # Create value map - # @param [OpenTox::Feature] Feature - # @return [Hash] A hash with keys 1...feature.training_classes.size and values training classes - def value_map - unless defined? @value_map - accept_values ? @value_map = accept_values.each_index.inject({}) { |h,idx| h[idx+1]=accept_values[idx]; h } : @value_map = nil - end - @value_map + class NumericFeature < Feature + def initialize params + super params + numeric = true end + end + + class Smarts < NominalFeature + field :name, as: :smarts, type: String # causes warnings + field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptors.smarts_match" + field :parameters, type: Hash, default: {:count => false} + def initialize params + super params + nominal = true + end + end + + class FminerSmarts < Smarts + field :training_algorithm, type: String + field :training_compound_ids, type: Array + field :training_feature_id, type: BSON::ObjectId + field :training_parameters, type: Hash + def initialize params + super params + supervised = true + end + end + + class NominalBioAssay < NominalFeature + field :description, type: String + end + + class NumericBioAssay < NumericFeature + field :description, type: String + end + class PhysChemDescriptor < NumericFeature + field :algorithm, type: String + field :parameters, type: Hash end end diff --git a/lib/format-conversion.rb b/lib/format-conversion.rb new file mode 100644 index 0000000..7563b94 --- /dev/null +++ b/lib/format-conversion.rb @@ -0,0 +1,406 @@ +# defaults to stderr, may be changed to file output (e.g in opentox-service) +$logger = OTLogger.new(STDERR) +$logger.level = Logger::DEBUG + +module OpenTox + + # Ruby interface + + attr_accessor :data + + # Create a new OpenTox object + # @param uri [optional,String] URI + # @return [OpenTox] OpenTox object + def initialize uri=nil + @data = {} + if uri + @data[:uri] = uri.to_s.chomp + get + else + @data[:uuid] = SecureRandom.uuid + @data[:uri] = File.join(service_uri, @data[:uuid]) + end + end + + # Object metadata (lazy loading) + # @return [Hash] Object metadata + def metadata force_update=false + get #if (@metadata.nil? or @metadata.empty? or force_update) and URI.accessible? @uri + @data + end + + # Metadata values + # @param predicate [String] Predicate URI + # @return [Array, String] Predicate value(s) + def [](predicate) + predicate = predicate.to_s + return nil if metadata[predicate].nil? + metadata[predicate].size == 1 ? metadata[predicate].first : metadata[predicate] + end + + # Set a metadata entry + # @param predicate [String] Predicate URI + # @param values [Array, String] Predicate value(s) + def []=(predicate,values) + predicate = predicate.to_s + @data[predicate] = [values].flatten + end + +=begin + # Object parameters (lazy loading) + # {http://opentox.org/dev/apis/api-1.2/interfaces OpenTox API} + # @return [Hash] Object parameters + def parameters force_update=false + if (@parameters.empty? or force_update) and URI.accessible? @uri + get #if @rdf.empty? or force_update + params = {} + query = RDF::Query.new({ + :parameter => { + RDF.type => RDF::OT.Parameter, + :property => :value, + } + }) + query.execute(@rdf).each do |solution| + params[solution.parameter] = {} unless params[solution.parameter] + params[solution.parameter][solution.property] = solution.value + end + @parameters = params.values + end + @parameters + end + + # Parameter value + # @param [String] title + # @return [String] value + def parameter_value title + @parameters.collect{|p| p[RDF::OT.paramValue] if p[RDF::DC.title] == title}.compact.first + end +=end + + # Get object from webservice + # @param [String,optional] mime_type + def get mime_type="application/json" + bad_request_error "Mime type #{mime_type} is not supported. Please use 'application/json' (default), 'text/plain' (ntriples) or mime_type == 'application/rdf+xml'." unless mime_type == "application/json" or mime_type == "text/plain" or mime_type == "application/rdf+xml" + p @data[:uri] + response = RestClientWrapper.get(@data[:uri],{},{:accept => mime_type}) + if URI.task?(response) + uri = wait_for_task response + response = RestClientWrapper.get(uri,{},{:accept => mime_type}) + p uri + end + case mime_type + when 'application/json' + p response + @data = JSON.parse(response) if response + when "text/plain" + parse_ntriples response + when "application/rdf+xml" + parse_rdfxml response + end + end + +=begin + # Post object to webservice (append to object), rarely useful and deprecated + # @deprecated + def post wait=true, mime_type="text/plain" + bad_request_error "Mime type #{mime_type} is not supported. Please use 'text/plain' (default) or 'application/rdf+xml'." unless mime_type == "text/plain" or mime_type == "application/rdf+xml" + case mime_type + when 'text/plain' + body = self.to_ntriples + when 'application/rdf+xml' + body = self.to_rdfxml + end + #Authorization.check_policy(@uri) if $aa[:uri] + uri = RestClientWrapper.post @uri.to_s, body, { :content_type => mime_type} + wait ? wait_for_task(uri) : uri + end +=end + + # Save object at webservice (replace or create object) + def put wait=true, mime_type="application/json" + bad_request_error "Mime type #{mime_type} is not supported. Please use 'application/json' (default)." unless mime_type == "application/json" or mime_type == "text/plain" or mime_type == "application/rdf+xml" + @data[:created_at] = DateTime.now unless URI.accessible? @data[:uri] + #@metadata[RDF::DC.modified] = DateTime.now + @data[:uri] ? @data[:uri] = uri.to_s.chomp : @data[:uri] = File.join(service_uri, SecureRandom.uuid) + case mime_type + when 'text/plain' + body = self.to_ntriples + when 'application/rdf+xml' + body = self.to_rdfxml + when 'application/json' + body = self.to_json + end + uri = RestClientWrapper.put @data[:uri], body, { :content_type => mime_type} + wait ? wait_for_task(uri) : uri + end + + # Delete object at webservice + def delete + RestClientWrapper.delete(@data[:uri]) + #Authorization.delete_policies_from_uri(@data[:uri]) if $aa[:uri] + end + + def service_uri + self.class.service_uri + end + + def create_rdf + #$logger.debug "#{eval("RDF::OT."+self.class.to_s.split('::').last)}\n" + @rdf = RDF::Graph.new + # DG: since model is no self.class anymore + @metadata[RDF.type] ||= (eval("RDF::OT."+self.class.to_s.split('::').last) =~ /Lazar|Generic/) ? RDF::URI.new(RDF::OT.Model) : RDF::URI.new(eval("RDF::OT."+self.class.to_s.split('::').last)) + #@metadata[RDF.type] ||= RDF::URI.new(eval("RDF::OT."+self.class.to_s.split('::').last)) + @metadata[RDF::DC.date] ||= DateTime.now + # DG: uri in object should be in brackets, otherwise query for uri-list ignores the object. + # see: http://www.w3.org/TR/rdf-testcases/#sec-uri-encoding + @metadata.each do |predicate,values| + [values].flatten.each{ |value| @rdf << [RDF::URI.new(@data[:uri]), predicate, (URI.valid?(value) ? RDF::URI.new(value) : value)] unless value.nil? } + end + @parameters.each do |parameter| + p_node = RDF::Node.new + @rdf << [RDF::URI.new(@data[:uri]), RDF::OT.parameters, p_node] + @rdf << [p_node, RDF.type, RDF::OT.Parameter] + parameter.each { |k,v| @rdf << [p_node, k, v] unless v.nil?} + end + end + + # as defined in opentox-client.rb + RDF_FORMATS.each do |format| + + # rdf parse methods for all formats e.g. parse_rdfxml + send :define_method, "parse_#{format}".to_sym do |rdf| + @rdf = RDF::Graph.new + RDF::Reader.for(format).new(rdf) do |reader| + reader.each_statement{ |statement| @rdf << statement } + end + # return values as plain strings instead of RDF objects + @metadata = @rdf.to_hash[RDF::URI.new(@data[:uri])].inject({}) { |h, (predicate, values)| h[predicate] = values.collect{|v| v.to_s}; h } + end + +=begin + # rdf serialization methods for all formats e.g. to_rdfxml + send :define_method, "to_#{format}".to_sym do + create_rdf + # if encoding is used iteration is necessary + # see: http://rubydoc.info/github/ruby-rdf/rdf/RDF/NTriples/Writer + RDF::Writer.for(format).buffer(:encoding => Encoding::ASCII) do |writer| + @rdf.each_statement do |statement| + writer << statement + end + end + end +=end + end + + # @return [String] converts object to turtle-string + def to_turtle # redefined to use prefixes (not supported by RDF::Writer) + prefixes = {:rdf => "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} + ['OT', 'DC', 'XSD', 'OLO'].each{|p| prefixes[p.downcase.to_sym] = eval("RDF::#{p}.to_s") } + create_rdf + RDF::Turtle::Writer.for(:turtle).buffer(:prefixes => prefixes) do |writer| + writer << @rdf + end + end + + def to_json + @data.to_json + end + + # @return [String] converts OpenTox object into html document (by first converting it to a string) + def to_html + to_turtle.to_html + end + + # short access for metadata keys title, description and type + [ :title , :description , :type , :uri, :uuid ].each do |method| + send :define_method, method do + self.data[method] + end + send :define_method, "#{method}=" do |value| + self.data[method] = value + end + end + + # define class methods within module + def self.included(base) + base.extend(ClassMethods) + end + + module ClassMethods + def service_uri + service = self.to_s.split('::')[1].downcase + eval("$#{service}[:uri]") + rescue + bad_request_error "$#{service}[:uri] variable not set. Please set $#{service}[:uri] or use an explicit uri as first constructor argument " + end + def subjectid + RestClientWrapper.subjectid + end + def subjectid=(subjectid) + RestClientWrapper.subjectid = subjectid + end + end + + # create default OpenTox classes with class methods + # (defined in opentox-client.rb) + CLASSES.each do |klass| + c = Class.new do + include OpenTox + + def self.all + uris = RestClientWrapper.get(service_uri, {},{:accept => 'text/uri-list'}).split("\n").compact + uris.collect{|uri| self.new(uri)} + end + + #@example fetching a model + # OpenTox::Model.find() -> model-object + def self.find uri + URI.accessible?(uri) ? self.new(uri) : nil + end + + def self.create metadata + object = self.new + object.data = metadata + object.put + object + end + + def self.find_or_create metadata + uris = RestClientWrapper.get(service_uri,{:query => @data},{:accept => "text/uri-list"}).split("\n") + uris.empty? ? self.create(@data) : self.new(uris.first) + end + end + OpenTox.const_set klass,c + end + +end + +# from overwrite.rb +class String + + # encloses URI in text with with link tag + # @return [String] new text with marked links + def link_urls + self.gsub(/(?i)http(s?):\/\/[^\r\n\s']*/, '\0') + end + + # produces a html page for making web services browser friendly + # format of text (=string params) is preserved (e.g. line breaks) + # urls are marked as links + # + # @param related_links [optional,String] uri on related resources + # @param description [optional,String] general info + # @param png_image [optional,String] imagename + # @return [String] html page + def to_html(related_links=nil, description=nil, png_image=nil ) + + # TODO add title as parameter + title = nil #$sinatra.to($sinatra.request.env['PATH_INFO'], :full) if $sinatra + html = "" + html << ""+title+"" if title + #html += "<\/img>" + + html << "

Description

"+description.link_urls+"

" if description + html << "

Related links

"+related_links.link_urls+"

" if related_links + html << "

Content

" if description || related_links + html << "

" + html << "\n" if png_image + html << self.link_urls + html << "

" + html + end + + def uri? + URI.valid?(self) + end +end + +module Kernel + +=begin + # overwrite backtick operator to catch system errors + # Override raises an error if _cmd_ returns a non-zero exit status. CH: I do not understand this comment + # Returns stdout if _cmd_ succeeds. Note that these are simply concatenated; STDERR is not inline. CH: I do not understand this comment + def ` cmd + stdout, stderr = '' + status = Open4::popen4(cmd) do |pid, stdin_stream, stdout_stream, stderr_stream| + stdout = stdout_stream.read + stderr = stderr_stream.read + end + internal_server_error "`" + cmd + "` failed.\n" + stdout + stderr unless status.success? + return stdout + rescue + internal_server_error $!.message + end +=end + + # @return [String] uri of task result, if task fails, an error according to task is raised + def wait_for_task uri + if URI.task?(uri) + t = OpenTox::Task.new uri + t.wait + unless t.completed? + error = OpenTox::RestClientWrapper.known_errors.select{|error| error[:code] == t.code}.first + error_method = error ? error[:method] : :internal_server_error + report = t.error_report + error_message = report ? report[:message] : $!.message + error_cause = report ? report[:errorCause] : nil + Object.send(error_method,error_message,t.uri,error_cause) + end + uri = t.resultURI + end + uri + end + + +end +module URI + + def self.compound? uri + uri =~ /compound/ and URI.valid? uri + end + + def self.task? uri + uri =~ /task/ and URI.valid? uri + end + + def self.dataset? uri + uri =~ /dataset/ and URI.accessible? uri + end + + def self.model? uri + uri =~ /model/ and URI.accessible? uri + end + + def self.ssl? uri + URI.parse(uri).instance_of? URI::HTTPS + end + + # @return [Boolean] checks if resource exists by making a HEAD-request + def self.accessible?(uri) + parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : "")) + http_code = URI.task?(uri) ? 600 : 400 + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + unless (URI.ssl? uri) == true + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + else + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + end + rescue + false + end + + def self.valid? uri + u = URI.parse(uri) + u.scheme!=nil and u.host!=nil + rescue URI::InvalidURIError + false + end + +end diff --git a/lib/opentox-client.rb b/lib/opentox-client.rb index 40f87cf..6bffc39 100644 --- a/lib/opentox-client.rb +++ b/lib/opentox-client.rb @@ -1,38 +1,17 @@ require 'rubygems' require "bundler/setup" -#require 'rdf' -#require 'rdf/raptor' -#require 'rdf/turtle' require "rest-client" -#require 'uri' require 'yaml' require 'json' require 'logger' -#require "securerandom" require 'mongoid' -default_config = File.join(ENV["HOME"],".opentox","config","default.rb") -client_config = File.join(ENV["HOME"],".opentox","config","opentox-client.rb") - -puts "Could not find configuration files #{default_config} or #{client_config}" unless File.exist? default_config or File.exist? client_config -require default_config if File.exist? default_config -require client_config if File.exist? client_config -# TODO switch to production +# TODO store development/test, validation, production in separate databases ENV["MONGOID_ENV"] = "development" Mongoid.load!("#{ENV['HOME']}/.opentox/config/mongoid.yml") -# define constants and global variables -#RDF::OT = RDF::Vocabulary.new 'http://www.opentox.org/api/1.2#' -#RDF::OT1 = RDF::Vocabulary.new 'http://www.opentox.org/api/1.1#' -#RDF::OTA = RDF::Vocabulary.new 'http://www.opentox.org/algorithmTypes.owl#' -#RDF::OLO = RDF::Vocabulary.new 'http://purl.org/ontology/olo/core#' -#RDF::TB = RDF::Vocabulary.new "http://onto.toxbank.net/api/" -#RDF::ISA = RDF::Vocabulary.new "http://onto.toxbank.net/isa/" -#RDF::OWL = RDF::Vocabulary.new "http://www.w3.org/2002/07/owl#" - -#CLASSES = ["Compound", "Feature", "Dataset", "Validation", "Task", "Investigation"] -CLASSES = ["Feature", "Dataset", "Validation", "Task", "Investigation"] -#RDF_FORMATS = [:rdfxml,:ntriples,:turtle] +CLASSES = ["Compound", "Feature", "DataEntry","Dataset"]#, "Validation", "Task", "Investigation"] +#CLASSES = ["Feature", "Dataset", "Validation", "Task", "Investigation"] # Regular expressions for parsing classification data TRUE_REGEXP = /^(true|active|1|1.0|tox|activating|carcinogen|mutagenic)$/i @@ -41,14 +20,15 @@ FALSE_REGEXP = /^(false|inactive|0|0.0|low tox|deactivating|non-carcinogen|non-m [ "overwrite.rb", "rest-client-wrapper.rb", - "error.rb", - "authorization.rb", - "policy.rb", - "otlogger.rb", + #"error.rb", + #"authorization.rb", + #"policy.rb", + #"otlogger.rb", "opentox.rb", - "task.rb", + #"task.rb", "compound.rb", "feature.rb", + "data_entry.rb", "dataset.rb", #"algorithm.rb", #"model.rb", @@ -61,11 +41,11 @@ FALSE_REGEXP = /^(false|inactive|0|0.0|low tox|deactivating|non-carcinogen|non-m #end # defaults to stderr, may be changed to file output (e.g in opentox-service) -$logger = OTLogger.new(STDOUT) # STDERR did not work on my development machine (CH) +$logger = Logger.new STDOUT #OTLogger.new(STDOUT) # STDERR did not work on my development machine (CH) $logger.level = Logger::DEBUG #Mongo::Logger.logger = $logger Mongo::Logger.level = Logger::WARN -#$mongo = Mongo::Client.new($mongodb[:uri]) +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox') +$gridfs = $mongo.database.fs Mongoid.logger.level = Logger::WARN Mongoid.logger = $logger -#Moped.logger = $logger diff --git a/lib/opentox.rb b/lib/opentox.rb index 554e686..33293ac 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -12,33 +12,11 @@ module OpenTox include Mongoid::Document include Mongoid::Timestamps store_in collection: klass.downcase.pluralize + field :title, as: :name, type: String - field :title, type: String - field :description, type: String - field :parameters, type: Array, default: [] - field :creator, type: String - - # TODO check if needed - def self.subjectid - RestClientWrapper.subjectid - end - def self.subjectid=(subjectid) - RestClientWrapper.subjectid = subjectid - end end OpenTox.const_set klass,c end - def type - self.class.to_s.split('::').last - end - - # Serialisation - - # @return [String] converts OpenTox object into html document (by first converting it to a string) - def to_html - self.to_json.to_html - end - end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 4dafe8d..2eb0b39 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -36,61 +36,40 @@ class String bad_request_error "invalid value for Boolean: \"#{self}\"" end - # encloses URI in text with with link tag - # @return [String] new text with marked links - def link_urls - self.gsub(/(?i)http(s?):\/\/[^\r\n\s']*/, '\0') - end - - # produces a html page for making web services browser friendly - # format of text (=string params) is preserved (e.g. line breaks) - # urls are marked as links - # - # @param related_links [optional,String] uri on related resources - # @param description [optional,String] general info - # @param png_image [optional,String] imagename - # @return [String] html page - def to_html(related_links=nil, description=nil, png_image=nil ) - - # TODO add title as parameter - title = nil #$sinatra.to($sinatra.request.env['PATH_INFO'], :full) if $sinatra - html = "" - html << ""+title+"" if title - #html += "<\/img>" - - html << "

Description

"+description.link_urls+"

" if description - html << "

Related links

"+related_links.link_urls+"

" if related_links - html << "

Content

" if description || related_links - html << "

" - html << "\n" if png_image - html << self.link_urls - html << "

" - html - end +end - def uri? - URI.valid?(self) +class File + # @return [String] mime_type including charset using linux cmd command + def mime_type + `file -ib '#{self.path}'`.chomp end - end -module URI +class Array - def self.compound? uri - uri =~ /compound/ and URI.valid? uri + # Sum up the size of single arrays in an array of arrays + # @param [Array] Array of arrays + # @return [Integer] Sum of size of array elements + def sum_size + self.inject(0) { |s,a| + if a.respond_to?('size') + s+=a.size + else + internal_server_error "No size available: #{a.inspect}" + end + } end - def self.task? uri - uri =~ /task/ and URI.valid? uri + # For symbolic features + # @param [Array] Array to test. + # @return [Boolean] Whether the array has just one unique value. + def zero_variance? + return self.uniq.size == 1 end - def self.dataset? uri - uri =~ /dataset/ and URI.accessible? uri - end +end - def self.model? uri - uri =~ /model/ and URI.accessible? uri - end +module URI def self.ssl? uri URI.parse(uri).instance_of? URI::HTTPS @@ -124,76 +103,3 @@ module URI end end - -class File - # @return [String] mime_type including charset using linux cmd command - def mime_type - `file -ib '#{self.path}'`.chomp - end -end - -module Kernel - -=begin - # overwrite backtick operator to catch system errors - # Override raises an error if _cmd_ returns a non-zero exit status. CH: I do not understand this comment - # Returns stdout if _cmd_ succeeds. Note that these are simply concatenated; STDERR is not inline. CH: I do not understand this comment - def ` cmd - stdout, stderr = '' - status = Open4::popen4(cmd) do |pid, stdin_stream, stdout_stream, stderr_stream| - stdout = stdout_stream.read - stderr = stderr_stream.read - end - internal_server_error "`" + cmd + "` failed.\n" + stdout + stderr unless status.success? - return stdout - rescue - internal_server_error $!.message - end -=end - - # @return [String] uri of task result, if task fails, an error according to task is raised - def wait_for_task uri - if URI.task?(uri) - t = OpenTox::Task.new uri - t.wait - unless t.completed? - error = OpenTox::RestClientWrapper.known_errors.select{|error| error[:code] == t.code}.first - error_method = error ? error[:method] : :internal_server_error - report = t.error_report - error_message = report ? report[:message] : $!.message - error_cause = report ? report[:errorCause] : nil - Object.send(error_method,error_message,t.uri,error_cause) - end - uri = t.resultURI - end - uri - end - - -end - - -class Array - - # Sum up the size of single arrays in an array of arrays - # @param [Array] Array of arrays - # @return [Integer] Sum of size of array elements - def sum_size - self.inject(0) { |s,a| - if a.respond_to?('size') - s+=a.size - else - internal_server_error "No size available: #{a.inspect}" - end - } - end - - # For symbolic features - # @param [Array] Array to test. - # @return [Boolean] Whether the array has just one unique value. - def zero_variance? - return self.uniq.size == 1 - end - -end - -- cgit v1.2.3