-rw-r--r--  lib/compound.rb   |   8
-rw-r--r--  lib/data_entry.rb |  12
-rw-r--r--  lib/dataset.rb    | 113
-rw-r--r--  lib/feature.rb    |   9
4 files changed, 108 insertions(+), 34 deletions(-)
diff --git a/lib/compound.rb b/lib/compound.rb
index 4a8089b..93f609f 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -15,10 +15,12 @@ module OpenTox
field :chemblid, type: String
field :image_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
+ #belongs_to :dataset
+ #belongs_to :data_entry
- def == compound
- self.inchi == compound.inchi
- end
+ #def == compound
+ #self.inchi == compound.inchi
+ #end
# Create a compound from smiles string
# @example
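
With the == override commented out, Compound comparison falls back to Mongoid's default document equality (same _id), so two records describing the same structure no longer compare as equal. Callers that need structural identity have to compare InChIs explicitly; a minimal sketch using the inchi field from this class (the helper name is hypothetical):

    module OpenTox
      class Compound
        # structural identity via InChI, replacing the removed == override
        def same_structure? other
          self.inchi == other.inchi
        end
      end
    end
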
diff --git a/lib/data_entry.rb b/lib/data_entry.rb
index 9f6e786..4eeb66d 100644
--- a/lib/data_entry.rb
+++ b/lib/data_entry.rb
@@ -1,15 +1,18 @@
module OpenTox
class DataEntry
- #field :feature_id, type: BSON::ObjectId
- #field :compound_id, type: BSON::ObjectId
+ field :feature_id, type: BSON::ObjectId
+ field :compound_id, type: BSON::ObjectId
# Kludge because csv import removes type information
- field :feature_id, type: String
- field :compound_id, type: String
+ #field :feature_id, type: String
+ #field :compound_id, type: String
field :value
field :warnings, type: String
field :unit, type: String
store_in collection: "data_entries"
+ belongs_to :dataset
+ has_one :compound
+ has_one :feature
# preferred method for the insertion of data entries
# @example DataEntry.find_or_create compound,feature,value
@@ -32,5 +35,6 @@ module OpenTox
def self.[](compound,feature)
self.where(:compound_id => compound.id.to_s, :feature_id => feature.id.to_s).distinct(:value).first
end
+
end
end
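
The accessors referenced in the comments above are used like this; a hedged sketch (from_smiles is the constructor documented in lib/compound.rb, feature_id and the value 1.5 are illustrative):

    compound = OpenTox::Compound.from_smiles "CC(=O)Oc1ccccc1C(=O)O"
    feature  = OpenTox::Feature.find feature_id          # feature_id assumed to exist
    # insert the value unless an identical entry is already present
    OpenTox::DataEntry.find_or_create compound, feature, 1.5
    # read it back: [] returns the first distinct value for the compound/feature pair
    OpenTox::DataEntry[compound, feature]                # => 1.5
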
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 503e409..2ade033 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -3,14 +3,33 @@ require 'tempfile'
module OpenTox
+ class LazarPrediction < Dataset
+ field :creator, type: String
+ end
+
+ class FminerDataset < Dataset
+ field :training_algorithm, type: String
+ field :training_dataset_id, type: BSON::ObjectId
+ field :training_feature_id, type: BSON::ObjectId
+ field :training_parameters, type: Hash
+ end
+
class Dataset
include Mongoid::Document
+ attr_accessor :bulk
+
+ # associations like has_many and belongs_to degrade performance
field :feature_ids, type: Array, default: []
field :compound_ids, type: Array, default: []
field :source, type: String
field :warnings, type: Array, default: []
+ def initialize params=nil
+ super params
+ @bulk = []
+ end
+
# Readers
def compounds
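
LazarPrediction and FminerDataset use Mongoid single collection inheritance: both are stored in the datasets collection alongside plain Datasets (distinguished by the _type field Mongoid adds), assuming OpenTox::Dataset is already defined when this file is loaded. A minimal instantiation sketch with illustrative field values:

    prediction = OpenTox::LazarPrediction.new(
      :creator => "lazar",          # field declared by the subclass
      :source  => "training.csv"    # field inherited from Dataset
    )
    prediction.save
    OpenTox::Dataset.find(prediction.id)  # retrievable through the parent class
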
@@ -21,6 +40,49 @@ module OpenTox
self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
end
+ def [](compound,feature)
+ bad_request_error "Incorrect parameter type. The first argument must be an OpenTox::Compound, the second an OpenTox::Feature." unless compound.is_a? Compound and feature.is_a? Feature
+ DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value).first
+ end
+
+ def fingerprint(compound)
+ data_entries[compound.id]
+ end
+
+ def data_entries
+ unless @data_entries
+ entries = {}
+ t = Time.now
+ DataEntry.where(dataset_id: self.id).each do |de|
+ entries[de.compound_id] ||= {}
+ entries[de.compound_id][de.feature_id] = de.value.first
+ end
+ $logger.debug "Retrieving data: #{Time.now-t}"
+ t = Time.now
+ @data_entries = {}
+ # TODO: check performance overhead
+ compound_ids.each do |cid|
+ @data_entries[cid] = []
+ feature_ids.each_with_index do |fid,i|
+ @data_entries[cid][i] = entries[cid][fid] if entries[cid] # guard: compound may have no entries
+ end
+ end
+ $logger.debug "Create @data_entries: #{Time.now-t}"
+ end
+ @data_entries
+ end
+
+ # Find data entry values for a given compound and feature
+ # @param compound [OpenTox::Compound] OpenTox Compound object
+ # @param feature [OpenTox::Feature] OpenTox Feature object
+ # @return [Array] Data entry values
+ def values(compound, feature)
+ DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value)
+ #rows = (0 ... compound_ids.length).select { |r| compound_ids[r] == compound.id }
+ #col = feature_ids.index feature.id
+ #rows.collect{|row| data_entries[row][col]}
+ end
+
# Writers
def compounds=(compounds)
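
Taken together, the readers above offer three granularities: [] fetches a single cell with a Mongo query, fingerprint returns one compound's row, and data_entries builds the whole compounds x features matrix once and memoises it in @data_entries. Note that de.value.first implies values are stored as arrays (cf. the v.split serialisation below). A hedged usage sketch, assuming a populated dataset:

    c = dataset.compounds.first
    f = dataset.features.first
    dataset[c, f]           # single cell, queried directly
    dataset.fingerprint(c)  # row for one compound, from the cached matrix
    dataset.data_entries    # full matrix as {compound_id => [values in feature order]}
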
@@ -151,21 +213,40 @@ module OpenTox
#def self.from_sdf_file
#end
+ def bulk_write
+ time = Time.now
+ # Workaround for Mongo bulk insertions (insertion of single data_entries is far too slow)
+ # Skip Ruby JSON serialisation:
+ # - to_json is too slow for writing to a file
+ # - JSON (or BSON) serialisation is probably responsible for the very long parse times of Mongo::BulkWrite (and of any other Ruby insert operation)
+ # This method adds noticeable overhead compared to direct string serialisation (e.g. total processing time 16 s instead of 12 s for the rat fminer dataset), but it can be reused in different places
+ dataset_id = self.id.to_s
+ f = Tempfile.new("#{dataset_id}.json","/tmp")
+ f.puts @bulk.collect{|row| "{\"dataset_id\": {\"$oid\": \"#{dataset_id}\"}, \"compound_id\": {\"$oid\": \"#{row[0]}\"}, \"feature_id\": {\"$oid\": \"#{row[1]}\"}, \"value\": #{row[2]}}"}.join("\n")
+ f.close
+ $logger.debug "Write JSON file: #{Time.now-time}"
+ # TODO DB name from config
+ puts `mongoimport --db opentox --collection data_entries --type json --file #{f.path} 2>&1`
+ $logger.debug "Bulk import: #{Time.now-time}"
+ @bulk = []
+ end
+
def self.from_csv_file file, source=nil, bioassay=true
source ||= file
table = CSV.read file, :skip_blanks => true
- from_table table, source, bioassay
+ parse_table table, source, bioassay
end
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
- def self.from_table table, source, bioassay=true
+ def self.parse_table table, source, bioassay=true
time = Time.now
# features
feature_names = table.shift.collect{|f| f.strip}
dataset = Dataset.new(:source => source)
+ dataset_id = dataset.id.to_s
dataset.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
compound_format = feature_names.shift.strip
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
@@ -175,7 +256,7 @@ module OpenTox
feature_names.each_with_index do |f,i|
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
types = values.collect{|v| v.numeric? ? true : false}.uniq
- metadata = {"name" => f, "source" => source}
+ metadata = {"name" => File.basename(f), "source" => source}
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
metadata["numeric"] = true
@@ -208,7 +289,6 @@ module OpenTox
# compounds and values
r = -1
- csv = ["compound_id,feature_id,value"]
compound_time = 0
value_time = 0
@@ -246,36 +326,23 @@ module OpenTox
dataset.warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[i]}' (column #{i+2})."
next
elsif numeric[i]
- csv << "#{cid},#{feature_ids[i]},#{v.to_f}" # retrieving ids from dataset.{compounds|features} kills performance
+ dataset.bulk << [cid,feature_ids[i],v.to_f]
else
- csv << "#{cid},#{feature_ids[i]},#{v.strip}" # retrieving ids from dataset.{compounds|features} kills performance
+ dataset.bulk << [cid,feature_ids[i],v.split]
end
end
end
- dataset.compounds.duplicates.each do |duplicates|
- # TODO fix and check
+ dataset.compounds.duplicates.each do |compound|
positions = []
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c == compound}
+ dataset.compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
dataset.warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
time = Time.now
+ dataset.bulk_write
+ dataset.save
- # Workaround for mongo bulk insertions (insertion of single data_entries is far too slow)
- # Skip ruby JSON serialisation:
- # - to_json is too slow to write to file
- # - json (or bson) serialisation is probably causing very long parse times of Mongo::BulkWrite, or any other ruby insert operation
- f = Tempfile.new("#{dataset.id.to_s}.csv","/tmp")
- f.puts csv.join("\n")
- f.close
- $logger.debug "Write file: #{Time.now-time}"
- time = Time.now
- # TODO DB name from config
- `mongoimport --db opentox --collection data_entries --type csv --headerline --file #{f.path}`
- $logger.debug "Bulk insert: #{Time.now-time}"
- time = Time.now
-
dataset
end
end
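
End to end, the refactored import path is: from_csv_file -> parse_table -> dataset.bulk (rows accumulated in memory) -> bulk_write (temp JSON file + mongoimport). A hedged usage sketch with an illustrative file path:

    dataset = OpenTox::Dataset.from_csv_file "data/hamster_carcinogenicity.csv"
    dataset.compounds.size  # compounds parsed from the SMILES/InChI column
    dataset.features.size   # features guessed from header names and value types
    dataset.warnings        # duplicates, empty values, ...

One detail worth flagging: the string branch pushes v.split instead of the old v.strip, so nominal values reach Mongo as (usually one-element) arrays. That is consistent with de.value.first in data_entries, but numeric values are written as bare floats, which value.first would not handle.
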
diff --git a/lib/feature.rb b/lib/feature.rb
index b2f9a93..de8e4c9 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -8,6 +8,8 @@ module OpenTox
field :calculated, type: Boolean
field :supervised, type: Boolean
field :source, as: :title, type: String
+ #belongs_to :dataset
+ #belongs_to :data_entry
end
class NominalFeature < Feature
@@ -36,10 +38,9 @@ module OpenTox
end
class FminerSmarts < Smarts
- field :training_algorithm, type: String
- field :training_compound_ids, type: Array
- field :training_feature_id, type: BSON::ObjectId
- field :training_parameters, type: Hash
+ field :pValue, type: Float
+ field :effect, type: String
+ field :dataset_id
def initialize params
super params
supervised = true
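
FminerSmarts now carries only the per-fragment statistics (pValue, effect) plus a dataset_id back-reference; the training_* fields move to the new FminerDataset in lib/dataset.rb. Note that supervised = true in initialize assigns a throwaway local variable; self.supervised = true was presumably intended, given the supervised field on Feature. A minimal construction sketch with illustrative values (the smarts field is assumed to come from the Smarts superclass):

    smarts = OpenTox::FminerSmarts.new(
      :smarts     => "c1ccccc1",
      :pValue     => 0.05,
      :effect     => "activating",
      :dataset_id => fminer_dataset.id  # fminer_dataset assumed to exist
    )
    smarts.save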