From e68c69f90036bb7c47c57acb1ee3652b73c835c1 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Fri, 31 Jul 2015 10:59:53 +0200
Subject: descriptor tests working

---
 lib/compound.rb | 13 +++++++------
 lib/dataset.rb  | 38 +++++++++++++++-----------------------
 lib/feature.rb  |  3 ++-
 3 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'lib')

diff --git a/lib/compound.rb b/lib/compound.rb
index 93f609f..4d36915 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -73,12 +73,13 @@ module OpenTox
     # Get sdf
     # @return [String] SDF string
     def sdf
-      if sdf_id.nil?
+      if self.sdf_id.nil?
         sdf = obconversion(inchi,"inchi","sdf")
         file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
         sdf_id = $gridfs.insert_one file
+        update :sdf_id => sdf_id
       end
-      $gridfs.find_one(_id: sdf_id).data
+      $gridfs.find_one(_id: self.sdf_id).data
     end
 
     # Get png image
@@ -86,12 +87,12 @@ module OpenTox
     #   image = compound.png
     # @return [image/png] Image data
     def png
-      if image_id.nil?
+      if self.image_id.nil?
         png = obconversion(inchi,"inchi","_png2")
         file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png")
         update(:image_id => $gridfs.insert_one(file))
       end
-      Base64.decode64($gridfs.find_one(_id: image_id).data)
+      Base64.decode64($gridfs.find_one(_id: self.image_id).data)
     end
 
 
@@ -134,11 +135,11 @@ module OpenTox
       OpenBabel::OBOp.find_type("Gen3D").do(obmol)
       sdf = obconversion.write_string(obmol)
       if sdf.match(/.nan/)
-        $logger.warn "3D generation failed for compound #{compound.inchi}, trying to calculate 2D structure"
+        $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
        OpenBabel::OBOp.find_type("Gen2D").do(obmol)
        sdf = obconversion.write_string(obmol)
        if sdf.match(/.nan/)
-          $logger.warn "2D generation failed for compound #{compound.inchi}"
+          $logger.warn "2D generation failed for compound #{identifier}"
          sdf = nil
        end
      end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 1392de5..92ef7b5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -18,7 +18,7 @@ module OpenTox
     include Mongoid::Document
 
     attr_accessor :bulk
-    #attr_writer :data_entries
+    attr_writer :data_entries
 
     # associations like has_many, belongs_to deteriorate performance
     field :feature_ids, type: Array, default: []
@@ -62,19 +62,29 @@ module OpenTox
     end
 
     def [](row,col)
-      #bad_request_error "Incorrect parameter type. The first argument is a OpenTox::Compound the second a OpenTox::Feature." unless compound.is_a? Compound and feature.is_a? Feature
-      #DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value).first
-      #data_entries[compound_ids.index(compound.id)][feature_ids.index(feature.id)]
       @data_entries[row,col]
     end
 
     def []=(row,col,v)
       @data_entries ||= []
       @data_entries[row] ||= []
-      #@data_entries ||= Array.new(compound_ids.size){Array.new(feature_ids.size)}
       @data_entries[row][col] = v
     end
 
+    # merge dataset (i.e. append features)
+    def +(dataset)
+      bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
+      bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
+      self.feature_ids ||= []
+      self.feature_ids = self.feature_ids + dataset.feature_ids
+      @data_entries ||= Array.new(compound_ids.size){[]}
+      @data_entries.each_with_index do |row,i|
+        @data_entries[i] = row + dataset.fingerprint(compounds[i])
+      end
+      self
+
+    end
+
     def fingerprint(compound)
       data_entries[compound_ids.index(compound.id)]
     end
@@ -232,24 +242,6 @@ module OpenTox
     #def self.from_sdf_file
     #end
 
-    def bulk_write
-      time = Time.now
-      # Workaround for mongo bulk insertions (insertion of single data_entries is far too slow)
-      # Skip ruby JSON serialisation:
-      # - to_json is too slow to write to file
-      # - json (or bson) serialisation is probably causing very long parse times of Mongo::BulkWrite, or any other ruby insert operation
-      # this method causes a noticeable overhead compared to direct string serialisation (e.g. total processing time 16" instead of 12" for rat fminer dataset), but it can be reused at different places
-      dataset_id = self.id.to_s
-      f = Tempfile.new("#{dataset_id}.json","/tmp")
-      f.puts @bulk.collect{|row| "{'dataset_id': {'$oid': '#{dataset_id}'},'compound_id': {'$oid': '#{row[0]}'}, 'feature_id': {'$oid': '#{row[1]}'}, 'value': #{row[2]}}"}.join("\n")
-      f.close
-      $logger.debug "Write JSON file: #{Time.now-time}"
-      # TODO DB name from config
-      puts `mongoimport --db opentox --collection data_entries --type json --file #{f.path} 2>&1`
-      $logger.debug "Bulk import: #{Time.now-time}"
-      @bulk = []
-    end
-
     def self.from_csv_file file, source=nil, bioassay=true
       source ||= file
       table = CSV.read file, :skip_blanks => true
diff --git a/lib/feature.rb b/lib/feature.rb
index de8e4c9..e565875 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -56,8 +56,9 @@ module OpenTox
   end
 
   class PhysChemDescriptor < NumericFeature
-    field :algorithm, type: String
+    field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
     field :parameters, type: Hash
+    field :creator, type: String
   end
 
 end
-- 
cgit v1.2.3
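
Editor's note: below is a minimal usage sketch (not part of the commit) for the Dataset#+ merge operator and the [](row,col) / []=(row,col,v) accessors introduced above. The require path and CSV file names are hypothetical; #+ requires both datasets to contain identical compound_ids, e.g. two tables derived from the same compound list.

    # Minimal sketch, assuming lib/lazar.rb is the library's top-level require
    # and that both (hypothetical) CSV files describe the same compounds in the
    # same order, as the compound_ids check in Dataset#+ demands.
    require_relative "lib/lazar"

    activities  = OpenTox::Dataset.from_csv_file "activities.csv"   # hypothetical file
    descriptors = OpenTox::Dataset.from_csv_file "descriptors.csv"  # hypothetical file

    # Appends the descriptor feature columns to the activity dataset;
    # raises bad_request_error if the compound lists differ.
    merged = activities + descriptors

    merged[0,0] = 0.5                           # []= fills a single cell of @data_entries
    merged.fingerprint(merged.compounds.first)  # data-entry row for the first compound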