From 3cea6abe3606ea586b733e943737f77d58f215f9 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Mon, 27 Jul 2015 20:51:25 +0200
Subject: reasonable query performance for data_entries

---
 lib/compound.rb   |   8 ++--
 lib/data_entry.rb |  12 ++++--
 lib/dataset.rb    | 113 +++++++++++++++++++++++++++++++++++++++++++-----------
 lib/feature.rb    |   9 +++--
 4 files changed, 108 insertions(+), 34 deletions(-)

diff --git a/lib/compound.rb b/lib/compound.rb
index 4a8089b..93f609f 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -15,10 +15,12 @@ module OpenTox
     field :chemblid, type: String
     field :image_id, type: BSON::ObjectId
     field :sdf_id, type: BSON::ObjectId
+    #belongs_to :dataset
+    #belongs_to :data_entry
 
-    def == compound
-      self.inchi == compound.inchi
-    end
+    #def == compound
+      #self.inchi == compound.inchi
+    #end
 
     # Create a compound from smiles string
     # @example
diff --git a/lib/data_entry.rb b/lib/data_entry.rb
index 9f6e786..4eeb66d 100644
--- a/lib/data_entry.rb
+++ b/lib/data_entry.rb
@@ -1,15 +1,18 @@
 module OpenTox
 
   class DataEntry
-    #field :feature_id, type: BSON::ObjectId
-    #field :compound_id, type: BSON::ObjectId
+    field :feature_id, type: BSON::ObjectId
+    field :compound_id, type: BSON::ObjectId
     # Kludge because csv import removes type information
-    field :feature_id, type: String
-    field :compound_id, type: String
+    #field :feature_id, type: String
+    #field :compound_id, type: String
     field :value
     field :warnings, type: String
     field :unit, type: String
     store_in collection: "data_entries"
+    belongs_to :dataset
+    has_one :compound
+    has_one :feature
 
     # preferred method for the insertion of data entries
     # @example DataEntry.find_or_create compound,feature,value
@@ -32,5 +35,6 @@ module OpenTox
     def self.[](compound,feature)
       self.where(:compound_id => compound.id.to_s, :feature_id => feature.id.to_s).distinct(:value).first
     end
+
   end
 end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 503e409..2ade033 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -3,14 +3,33 @@ require 'tempfile'
 
 module OpenTox
 
+  class LazarPrediction < Dataset
+    field :creator, type: String
+  end
+
+  class FminerDataset < Dataset
+    field :training_algorithm, type: String
+    field :training_dataset_id, type: BSON::ObjectId
+    field :training_feature_id, type: BSON::ObjectId
+    field :training_parameters, type: Hash
+  end
+
   class Dataset
     include Mongoid::Document
 
+    attr_accessor :bulk
+
+    # associations like has_many, belongs_to deteriorate performance
     field :feature_ids, type: Array, default: []
     field :compound_ids, type: Array, default: []
     field :source, type: String
     field :warnings, type: Array, default: []
 
+    def initialize params=nil
+      super params
+      @bulk = []
+    end
+
     # Readers
 
     def compounds
@@ -21,6 +40,49 @@ module OpenTox
       self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
     end
 
+    def [](compound,feature)
+      bad_request_error "Incorrect parameter type. The first argument must be an OpenTox::Compound, the second an OpenTox::Feature." unless compound.is_a? Compound and feature.is_a? Feature
+      DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value).first
+    end
+
+    def fingerprint(compound)
+      data_entries[compound.id]
+    end
+
+    def data_entries
+      unless @data_entries
+        entries = {}
+        t = Time.now
+        DataEntry.where(dataset_id: self.id).each do |de|
+          entries[de.compound_id] ||= {}
+          entries[de.compound_id][de.feature_id] = de.value.first
+        end
+        $logger.debug "Retrieving data: #{Time.now-t}"
+        t = Time.now
+        @data_entries = {}
+        # TODO: check performance overhead
+        compound_ids.each do |cid|
+          @data_entries[cid] = []
+          feature_ids.each_with_index do |fid,i|
+            @data_entries[cid][i] = entries[cid][fid]
+          end
+        end
+        $logger.debug "Create @data_entries: #{Time.now-t}"
+      end
+      @data_entries
+    end
+
+    # Find data entry values for a given compound and feature
+    # @param compound [OpenTox::Compound] OpenTox Compound object
+    # @param feature [OpenTox::Feature] OpenTox Feature object
+    # @return [Array] Data entry values
+    def values(compound, feature)
+      DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value)
+      #rows = (0 ... compound_ids.length).select { |r| compound_ids[r] == compound.id }
+      #col = feature_ids.index feature.id
+      #rows.collect{|row| data_entries[row][col]}
+    end
+
     # Writers
 
     def compounds=(compounds)
@@ -151,21 +213,40 @@ module OpenTox
     #def self.from_sdf_file
     #end
 
+    def bulk_write
+      time = Time.now
+      # Workaround for mongo bulk insertions (insertion of single data_entries is far too slow)
+      # Skip ruby JSON serialisation:
+      # - to_json is too slow to write to file
+      # - json (or bson) serialisation is probably causing very long parse times of Mongo::BulkWrite, or any other ruby insert operation
+      # this method causes a noticeable overhead compared to direct string serialisation (e.g. total processing time 16" instead of 12" for rat fminer dataset), but it can be reused at different places
+      dataset_id = self.id.to_s
+      f = Tempfile.new("#{dataset_id}.json","/tmp")
+      f.puts @bulk.collect{|row| "{'dataset_id': {'$oid': '#{dataset_id}'},'compound_id': {'$oid': '#{row[0]}'}, 'feature_id': {'$oid': '#{row[1]}'}, 'value': #{row[2]}}"}.join("\n")
+      f.close
+      $logger.debug "Write JSON file: #{Time.now-time}"
+      # TODO DB name from config
+      puts `mongoimport --db opentox --collection data_entries --type json --file #{f.path} 2>&1`
+      $logger.debug "Bulk import: #{Time.now-time}"
+      @bulk = []
+    end
+
     def self.from_csv_file file, source=nil, bioassay=true
       source ||= file
       table = CSV.read file, :skip_blanks => true
-      from_table table, source, bioassay
+      parse_table table, source, bioassay
     end
 
     # parse data in tabular format (e.g. from csv)
     # does a lot of guesswork in order to determine feature types
-    def self.from_table table, source, bioassay=true
+    def self.parse_table table, source, bioassay=true
       time = Time.now
 
       # features
       feature_names = table.shift.collect{|f| f.strip}
       dataset = Dataset.new(:source => source)
+      dataset_id = dataset.id.to_s
       dataset.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
       compound_format = feature_names.shift.strip
       bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
@@ -175,7 +256,7 @@ module OpenTox
       feature_names.each_with_index do |f,i|
         values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
         types = values.collect{|v| v.numeric? ? true : false}.uniq
-        metadata = {"name" => f, "source" => source}
+        metadata = {"name" => File.basename(f), "source" => source}
         if values.size == 0 # empty feature
         elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
           metadata["numeric"] = true
@@ -208,7 +289,6 @@ module OpenTox
 
       # compounds and values
       r = -1
-      csv = ["compound_id,feature_id,value"]
       compound_time = 0
       value_time = 0
 
@@ -246,36 +326,23 @@ module OpenTox
             dataset.warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[i]}' (column #{i+2})."
             next
           elsif numeric[i]
-            csv << "#{cid},#{feature_ids[i]},#{v.to_f}" # retrieving ids from dataset.{compounds|features} kills performance
+            dataset.bulk << [cid,feature_ids[i],v.to_f]
           else
-            csv << "#{cid},#{feature_ids[i]},#{v.strip}" # retrieving ids from dataset.{compounds|features} kills performance
+            dataset.bulk << [cid,feature_ids[i],v.split]
          end
        end
      end
-      dataset.compounds.duplicates.each do |duplicates|
-        # TODO fix and check
+      dataset.compounds.duplicates.each do |compound|
        positions = []
-        compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c == compound}
+        dataset.compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
        dataset.warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
      end
 
      $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
      time = Time.now
+      dataset.bulk_write
+      dataset.save
 
-      # Workaround for mongo bulk insertions (insertion of single data_entries is far too slow)
-      # Skip ruby JSON serialisation:
-      # - to_json is too slow to write to file
-      # - json (or bson) serialisation is probably causing very long parse times of Mongo::BulkWrite, or any other ruby insert operation
-      f = Tempfile.new("#{dataset.id.to_s}.csv","/tmp")
-      f.puts csv.join("\n")
-      f.close
-      $logger.debug "Write file: #{Time.now-time}"
-      time = Time.now
-      # TODO DB name from config
-      `mongoimport --db opentox --collection data_entries --type csv --headerline --file #{f.path}`
-      $logger.debug "Bulk insert: #{Time.now-time}"
-      time = Time.now
-
      dataset
    end
  end
diff --git a/lib/feature.rb b/lib/feature.rb
index b2f9a93..de8e4c9 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -8,6 +8,8 @@ module OpenTox
     field :calculated, type: Boolean
     field :supervised, type: Boolean
     field :source, as: :title, type: String
+    #belongs_to :dataset
+    #belongs_to :data_entry
   end
 
   class NominalFeature < Feature
@@ -36,10 +38,9 @@ module OpenTox
   end
 
   class FminerSmarts < Smarts
-    field :training_algorithm, type: String
-    field :training_compound_ids, type: Array
-    field :training_feature_id, type: BSON::ObjectId
-    field :training_parameters, type: Hash
+    field :pValue, type: Float
+    field :effect, type: String
+    field :dataset_id
     def initialize params
       super params
       supervised = true
-- 
cgit v1.2.3
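
A usage sketch of the new query API (not part of the commit): the patch moves data entries into their own indexed "data_entries" collection and reads them either value-by-value (Dataset#[], Dataset#values) or en bloc with caching (Dataset#data_entries, Dataset#fingerprint). The snippet below is a minimal sketch, assuming the patched library is loaded against a running local MongoDB with the "opentox" database; the CSV file name is a placeholder.

    # illustrative only -- method names are from the patch, the file name is invented
    dataset  = OpenTox::Dataset.from_csv_file "training.csv"
    compound = dataset.compounds.first
    feature  = dataset.features.first

    dataset[compound, feature]         # first distinct value, one DataEntry query
    dataset.values(compound, feature)  # all measured values (duplicates come from repeated experiments)
    dataset.fingerprint(compound)      # whole row for one compound, built once and memoised in @data_entries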
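For reference, bulk_write serialises each queued [compound_id, feature_id, value] row to one document per line in a temp file and shells out to mongoimport, deliberately bypassing Ruby-side JSON/BSON serialisation. A queued numeric row comes out roughly like this (the ObjectId hex strings are invented for illustration):

    {'dataset_id': {'$oid': '55b63dd30000000000000001'},'compound_id': {'$oid': '55b63dd30000000000000002'}, 'feature_id': {'$oid': '55b63dd30000000000000003'}, 'value': 1.0}

Strictly speaking the single-quoted keys are not valid JSON; the import only works as long as mongoimport's parser tolerates them, so emitting double-quoted keys would be the more portable choice.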
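One performance caveat, an observation rather than part of the commit: [], values and data_entries all filter the data_entries collection by dataset_id, compound_id and feature_id, and the patch itself creates no indexes for those fields, so large collections may still fall back to collection scans. A possible one-off fix in the same shell-out style the patch already uses (the index definition is a suggestion, not code from the commit):

    # suggested supporting index, not part of the commit
    `mongo opentox --eval "db.data_entries.createIndex({dataset_id: 1, compound_id: 1, feature_id: 1})"`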