From 6efd73ed92c0a1eee46464ec11d0ed41df3570e9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 3 Aug 2015 18:04:58 +0200 Subject: initial classification validation --- lib/dataset.rb | 105 +++++++++++++++------------------------------------------ 1 file changed, 28 insertions(+), 77 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 45f7119..152545b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,9 +5,10 @@ module OpenTox class LazarPrediction < Dataset field :creator, type: String - def value compound - end - def confidence compound + field :prediction_feature_id, type: String + + def prediction_feature + Feature.find prediction_feature_id end end @@ -159,80 +160,32 @@ module OpenTox end end - # Methods for for validation service - - # create a new dataset with the specified compounds and features - # @param compound_indices [Array] compound indices (integers) - # @param feats [Array] features objects - # @param metadata [Hash] - # @return [OpenTox::Dataset] - # TODO - def split( compound_indices, feats, metadata) - - bad_request_error "Dataset.split : Please give compounds as indices" if compound_indices.size==0 or !compound_indices[0].is_a?(Fixnum) - bad_request_error "Dataset.split : Please give features as feature objects (given: #{feats})" if feats!=nil and feats.size>0 and !feats[0].is_a?(OpenTox::Feature) - dataset = OpenTox::Dataset.new - dataset.metadata = metadata - dataset.features = (feats ? feats : self.features) - compound_indices.each do |c_idx| - d = [ self.compounds[c_idx] ] - dataset.features.each_with_index.each do |f,f_idx| - d << (self.data_entries[c_idx] ? self.data_entries[c_idx][f_idx] : nil) - end - dataset << d - end - dataset.put - dataset - end - - - # maps a compound-index from another dataset to a compound-index from this dataset - # mapping works as follows: - # (compound c is the compound identified by the compound-index of the other dataset) - # * c occurs only once in this dataset? map compound-index of other dataset to index in this dataset - # * c occurs >1 in this dataset? - # ** number of occurences is equal in both datasets? assume order is preserved(!) and map accordingly - # ** number of occurences is not equal in both datasets? cannot map, raise error - # @param dataset [OpenTox::Dataset] dataset that should be mapped to this dataset (fully loaded) - # @param compound_index [Fixnum], corresponding to dataset - # TODO - def compound_index( dataset, compound_index ) - compound_inchi = dataset.compounds[compound_index].inchi - self_indices = compound_indices(compound_inchi) - if self_indices==nil - nil - else - dataset_indices = dataset.compound_indices(compound_inchi) - if self_indices.size==1 - self_indices.first - elsif self_indices.size==dataset_indices.size - # we do assume that the order is preseverd (i.e., the nth occurences in both datasets are mapped to each other)! - self_indices[dataset_indices.index(compound_index)] - else - raise "cannot map compound #{compound_inchi} from dataset #{dataset.id} to dataset #{self.id}, "+ - "compound occurs #{dataset_indices.size} times and #{self_indices.size} times" - end + # split dataset into n folds + def folds n + len = self.compound_ids.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_cids = test_idxs.collect{|i| self.compound_ids[i]} + test_data_entries = test_idxs.collect{|i| self.data_entries[i]} + test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries) + training_idxs = indices-test_idxs + training_cids = training_idxs.collect{|i| self.compound_ids[i]} + training_data_entries = training_idxs.collect{|i| self.data_entries[i]} + training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries) + test_dataset.save_all + training_dataset.save_all + chunks << [training_dataset,test_dataset] + start = last+1 end + chunks end - # returns the inidices of the compound in the dataset - # @param compound_inchi [String] - # @return [Array] compound index (position) of the compound in the dataset, array-size is 1 unless multiple occurences - # TODO - def compound_indices( compound_inchi ) - unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound_inchi) - @cmp_indices = {} - compounds().size.times do |i| - c = self.compounds[i].inchi - if @cmp_indices[c]==nil - @cmp_indices[c] = [i] - else - @cmp_indices[c] = @cmp_indices[c]+[i] - end - end - end - @cmp_indices[compound_inchi] - end # Adding data methods # (Alternatively, you can directly change @data["feature_ids"] and @data["compounds"]) @@ -247,7 +200,7 @@ module OpenTox def self.from_csv_file file, source=nil, bioassay=true source ||= file table = CSV.read file, :skip_blanks => true - dataset = Dataset.new(:source => source, :name => File.basename(file)) + dataset = self.new(:source => source, :name => File.basename(file)) dataset.parse_table table, bioassay dataset end @@ -295,7 +248,6 @@ module OpenTox end feature_ids << OpenTox::Feature.find_or_create_by(metadata).id end - #feature_ids = dataset.features.collect{|f| f.id.to_s} $logger.debug "Feature values: #{Time.now-time}" time = Time.now @@ -319,7 +271,6 @@ module OpenTox next end when /InChI/i - # compounds and values compound = OpenTox::Compound.from_inchi(identifier) end rescue -- cgit v1.2.3