From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Wed, 4 Nov 2015 17:50:17 +0100
Subject: neighbor search delegated to database backend

---
 lib/dataset.rb | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index d989bdf..af116a9 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -105,10 +105,18 @@ module OpenTox
       test_cids = test_idxs.collect{|i| self.compound_ids[i]}
       test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
       test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
+      test_dataset.compounds.each do |compound|
+        compound.dataset_ids << test_dataset.id
+        compound.save
+      end
       training_idxs = indices-test_idxs
       training_cids = training_idxs.collect{|i| self.compound_ids[i]}
       training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
       training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
+      training_dataset.compounds.each do |compound|
+        compound.dataset_ids << training_dataset.id
+        compound.save
+      end
       test_dataset.save_all
       training_dataset.save_all
       chunks << [training_dataset,test_dataset]
@@ -229,7 +237,7 @@ module OpenTox
       table.each_with_index do |vals,i|
         ct = Time.now
-        identifier = vals.shift
+        identifier = vals.shift.strip
         warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
         begin
           case compound_format
@@ -246,7 +254,7 @@ module OpenTox
           warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
           next
         end
-        # TODO insert empty compounds to keep positions?
+        compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
         compound_time += Time.now-ct
         r += 1
@@ -263,10 +271,15 @@ module OpenTox
             warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
             next
           elsif numeric[j]
-            self.data_entries.last[j] = v.to_f
+            v = v.to_f
           else
-            self.data_entries.last[j] = v.strip
+            v = v.strip
           end
+          self.data_entries.last[j] = v
+          #i = compound.feature_ids.index feature_ids[j]
+          compound.features[feature_ids[j].to_s] ||= []
+          compound.features[feature_ids[j].to_s] << v
+          compound.save
         end
       end
       compounds.duplicates.each do |compound|
-- cgit v1.2.3
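The commit above denormalizes dataset membership onto each compound: every compound now carries the ids of the datasets it appears in, and every parsed CSV row also records its feature values on the compound document itself. The payoff named in the subject line is that neighbor candidates can then be fetched with a single database query instead of a Ruby-side scan over all compounds. A minimal sketch of that pattern, assuming a configured Mongoid connection; the Compound model shown here, the fingerprint field, the tanimoto helper and the 0.1 cutoff are illustrative stand-ins, not code from this log:

    require 'mongoid'

    # Illustrative stand-in for OpenTox::Compound; field names follow the diff above.
    class Compound
      include Mongoid::Document
      field :dataset_ids, type: Array, default: []
      field :fingerprint, type: Array, default: []
    end

    # Tanimoto similarity of two fingerprints (arrays of feature keys).
    def tanimoto a, b
      (a & b).size.to_f / (a | b).size
    end

    # Because membership lives on the compound, candidate selection is one
    # indexed MongoDB query (equality on an array field matches its elements):
    candidates = Compound.where(:dataset_ids => training_dataset.id)

    # Only the similarity filter itself remains in Ruby:
    neighbors = candidates.select{|c| tanimoto(c.fingerprint, query.fingerprint) > 0.1}

Here training_dataset and query are placeholders for a persisted dataset and a query compound.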
From d6eced29e104b9bc1923b2ac89b2700a48adf07a Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Fri, 8 Jan 2016 11:00:20 +0100
Subject: mg-mmol conversion fixed

---
 lib/dataset.rb | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 366c79f..55cde63 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -126,6 +126,17 @@ module OpenTox
     end

     # Diagnostics
+
+    def duplicates feature=self.features.first
+      col = feature_ids.index feature.id
+      dups = {}
+      compound_ids.each_with_index do |cid,i|
+        rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
+        values = rows.collect{|row| data_entries[row][col]}
+        dups[cid] = values if values.size > 1
+      end
+      dups
+    end

     def correlation_plot training_dataset
       # TODO: create/store svg
@@ -162,10 +173,10 @@ module OpenTox
     # TODO
     #def self.from_sdf_file
     #end
-
+
     # Create a dataset from CSV file
     # TODO: document structure
-    def self.from_csv_file file, source=nil, bioassay=true
+    def self.from_csv_file file, source=nil, bioassay=true#, layout={}
       source ||= file
       name = File.basename(file,".*")
       dataset = self.find_by(:source => source, :name => name)
@@ -175,7 +186,7 @@ module OpenTox
       $logger.debug "Parsing #{file}."
       table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
       dataset = self.new(:source => source, :name => name)
-      dataset.parse_table table, bioassay
+      dataset.parse_table table, bioassay#, layout
     end
     dataset
   end
-- cgit v1.2.3

From e778475c578f13f30af4437845716d7e781c2609 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Sat, 13 Feb 2016 13:15:29 +0100
Subject: improved handling of duplicates in validations

---
 lib/dataset.rb | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 55cde63..7925bcd 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -93,6 +93,7 @@ module OpenTox
     # @param [Integer] number of folds
     # @return [Array] Array with folds [training_dataset,test_dataset]
     def folds n
+      # TODO fix splits for duplicates
       len = self.compound_ids.size
       indices = (0..len-1).to_a.shuffle
       mid = (len/n)
-- cgit v1.2.3
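The duplicates method added above reports replicate measurements per compound, and the TODO just added to folds names the underlying flaw: folds shuffles row indices, while replicate measurements of the same compound occupy several rows. A random index split can therefore put one replicate into the training set and another into the test set, so the model is evaluated on a compound it has already seen. A self-contained illustration with hypothetical compound ids:

    # Compound "a" is measured twice. Splitting on row indices treats the
    # replicates as independent rows, so the shuffle can separate them.
    compound_ids = ["a", "b", "c", "a"]
    indices = (0..compound_ids.size-1).to_a.shuffle
    test_idxs = indices[0..1]
    training_idxs = indices - test_idxs
    test_cids = test_idxs.collect{|i| compound_ids[i]}
    training_cids = training_idxs.collect{|i| compound_ids[i]}
    # Fires whenever the two "a" rows land in different folds, which
    # silently inflates crossvalidation statistics:
    puts "compound leaked into both folds" unless (test_cids & training_cids).empty?

The next commit rewrites folds to split on unique compounds.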
From c4b56b22fd6e65633deb7e52bd99865e3bee8f00 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Mon, 29 Feb 2016 13:02:37 +0100
Subject: crossvalidation folds fixed for duplicates

---
 lib/dataset.rb | 102 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 46 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 7925bcd..59a68e5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,25 +5,12 @@ module OpenTox

   class Dataset

-    #attr_writer :data_entries
-
     # associations like has_many, belongs_to deteriorate performance
     field :feature_ids, type: Array, default: []
     field :compound_ids, type: Array, default: []
-    #field :data_entries_id, type: BSON::ObjectId
     field :data_entries, type: Array, default: []
     field :source, type: String

-    # Save all data including data_entries
-    # Should be used instead of save
-    def save_all
-      save
-      #dump = Marshal.dump(@data_entries)
-      #file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
-      #entries_id = $gridfs.insert_one(file)
-      #update(:data_entries_id => entries_id)
-    end
-
     # Readers

     # Get all compounds
@@ -38,33 +25,6 @@ module OpenTox
       @features
     end

-=begin
-    # Get all data_entries
-    def data_entries
-      unless @data_entries
-        t = Time.now
-        data_entry_file = $gridfs.find_one(_id: data_entries_id)
-        if data_entry_file.nil?
-          @data_entries = []
-        else
-          @data_entries = Marshal.load(data_entry_file.data)
-          bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
-          unless @data_entries.first.size == feature_ids.size
-            # TODO: fix (unknown) source of empty data_entries
-            sleep 1
-            data_entry_file = $gridfs.find_one(_id: data_entries_id)
-            @data_entries = Marshal.load(data_entry_file.data)
-          end
-          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
-          # TODO: data_entries can be empty, poorly reproducible, mongo problem?
-          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
-          #$logger.debug "Retrieving data: #{Time.now-t}"
-        end
-      end
-      @data_entries
-    end
-=end
-
     # Find data entry values for a given compound and feature
     # @param compound [OpenTox::Compound] OpenTox Compound object
     # @param feature [OpenTox::Feature] OpenTox Feature object
@@ -92,9 +52,11 @@ module OpenTox

     # Dataset operations

     # Split a dataset into n folds
     # @param [Integer] number of folds
     # @return [Array] Array with folds [training_dataset,test_dataset]
+=begin
     def folds n
       # TODO fix splits for duplicates
-      len = self.compound_ids.size
+      unique_compound_ids = compound_ids.uniq
+      len = unique_compound_ids.size
       indices = (0..len-1).to_a.shuffle
       mid = (len/n)
       chunks = []
       start = 0
       1.upto(n) do |i|
         last = start+mid
         last = last-1 unless len%n >= i
         test_idxs = indices[start..last] || []
-        test_cids = test_idxs.collect{|i| self.compound_ids[i]}
+        test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
         test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
         test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
         test_dataset.compounds.each do |compound|
           compound.dataset_ids << test_dataset.id
           compound.save
         end
@@ -111,20 +73,68 @@ module OpenTox
         training_idxs = indices-test_idxs
-        training_cids = training_idxs.collect{|i| self.compound_ids[i]}
+        training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
         training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
         training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
         training_dataset.compounds.each do |compound|
           compound.dataset_ids << training_dataset.id
           compound.save
         end
-        test_dataset.save_all
-        training_dataset.save_all
+        test_dataset.save
+        training_dataset.save
         chunks << [training_dataset,test_dataset]
         start = last+1
       end
       chunks
     end
+=end
+
+    # Split a dataset into n folds
+    # @param [Integer] number of folds
+    # @return [Array] Array with folds [training_dataset,test_dataset]
+    def folds n
+      unique_compound_data = {}
+      compound_ids.each_with_index do |cid,i|
+        unique_compound_data[cid] ||= []
+        unique_compound_data[cid] << data_entries[i]
+      end
+      unique_compound_ids = unique_compound_data.keys
+      len = unique_compound_ids.size
+      indices = (0..len-1).to_a.shuffle
+      mid = (len/n)
+      chunks = []
+      start = 0
+      1.upto(n) do |i|
+        last = start+mid
+        last = last-1 unless len%n >= i
+        test_idxs = indices[start..last] || []
+        test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
+        training_idxs = indices-test_idxs
+        training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
+        chunk = [training_cids,test_cids].collect do |unique_cids|
+          cids = []
+          data_entries = []
+          unique_cids.each do |cid|
+            unique_compound_data[cid].each do |de|
+              cids << cid
+              data_entries << de
+            end
+          end
+          dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
+=begin
+          dataset.compounds.each do |compound|
+            compound.dataset_ids << dataset.id
+            compound.save
+          end
+=end
+          dataset
+        end
+        start = last+1
+        chunks << chunk
+      end
+      puts chunks.inspect
+      chunks
+    end

     # Diagnostics

@@ -337,7 +347,7 @@ module OpenTox
       scaled_dataset.centers = centers
       scaled_dataset.scales = scales
       scaled_dataset.data_entries = scaled_data_entries
-      scaled_dataset.save_all
+      scaled_dataset.save
       scaled_dataset
     end
   end
-- cgit v1.2.3
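The rewritten folds above first groups all data entries by compound id, shuffles the unique ids, and only then materializes training and test datasets, so every replicate of a compound lands in the same fold. Stripped of the Mongoid persistence and the per-compound bookkeeping, the idea fits in a few lines. A sketch over hypothetical (compound id, value) pairs; fold sizes are approximated with each_slice:

    # Returns an array of [training_rows, test_rows] pairs. Replicates stay
    # together because rows are grouped by compound id before the shuffle.
    def folds rows, n
      grouped = rows.group_by{|cid, _value| cid}
      unique_ids = grouped.keys.shuffle
      unique_ids.each_slice((unique_ids.size / n.to_f).ceil).collect do |test_ids|
        training_ids = unique_ids - test_ids
        [training_ids, test_ids].collect{|ids| ids.flat_map{|cid| grouped[cid]}}
      end
    end

    rows = [["a", 1.0], ["b", 2.0], ["a", 1.2], ["c", 3.0]]
    folds(rows, 2).each do |training, test|
      # Never fires: no compound id appears on both sides of a split.
      raise "leak" unless (training.map(&:first) & test.map(&:first)).empty?
    end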
From 24b1524f20eccd3bfd59171f1f7151fcc272a427 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Mon, 14 Mar 2016 10:06:22 +0100
Subject: folds split on unique compounds instead of data entries

---
 lib/dataset.rb | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 59a68e5..b9c2187 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -49,46 +49,6 @@ module OpenTox

     # Dataset operations

-    # Split a dataset into n folds
-    # @param [Integer] number of folds
-    # @return [Array] Array with folds [training_dataset,test_dataset]
-=begin
-    def folds n
-      # TODO fix splits for duplicates
-      unique_compound_ids = compound_ids.uniq
-      len = unique_compound_ids.size
-      indices = (0..len-1).to_a.shuffle
-      mid = (len/n)
-      chunks = []
-      start = 0
-      1.upto(n) do |i|
-        last = start+mid
-        last = last-1 unless len%n >= i
-        test_idxs = indices[start..last] || []
-        test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
-        test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
-        test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
-        test_dataset.compounds.each do |compound|
-          compound.dataset_ids << test_dataset.id
-          compound.save
-        end
-        training_idxs = indices-test_idxs
-        training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
-        training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
-        training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
-        training_dataset.compounds.each do |compound|
-          compound.dataset_ids << training_dataset.id
-          compound.save
-        end
-        test_dataset.save
-        training_dataset.save
-        chunks << [training_dataset,test_dataset]
-        start = last+1
-      end
-      chunks
-    end
-=end
-
     # Split a dataset into n folds
     # @param [Integer] number of folds
     # @return [Array] Array with folds [training_dataset,test_dataset]
@@ -121,18 +81,15 @@ module OpenTox
           end
         end
         dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
-=begin
         dataset.compounds.each do |compound|
           compound.dataset_ids << dataset.id
           compound.save
         end
-=end
         dataset
       end
       start = last+1
       chunks << chunk
     end
-    puts chunks.inspect
     chunks
   end
-- cgit v1.2.3
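The cleanup above deletes the superseded index-based folds that had been parked under =begin/=end, re-enables the per-compound dataset_ids bookkeeping inside the new method, and drops a leftover debugging puts. A sketch of how the finished method is typically consumed in a crossvalidation loop; Model::Lazar.create and model.predict are hypothetical stand-ins for lazar's validation code, which lives outside lib/dataset.rb and is not part of this log:

    dataset = OpenTox::Dataset.find dataset_id   # dataset_id is a placeholder
    dataset.folds(10).each_with_index do |(training_dataset, test_dataset), fold|
      model = Model::Lazar.create training_dataset   # hypothetical API
      test_dataset.compounds.each do |compound|
        prediction = model.predict compound          # hypothetical API
      end
      $logger.debug "fold #{fold}: #{training_dataset.compound_ids.size} training rows, #{test_dataset.compound_ids.size} test rows"
    end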
From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Mon, 14 Mar 2016 15:25:50 +0100
Subject: descriptor tests

---
 lib/dataset.rb | 2 --
 1 file changed, 2 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index b9c2187..af851b5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -132,7 +132,6 @@ module OpenTox
       end
     end
-
     # Parsers

     # Create a dataset from file (csv,sdf,...)
@@ -211,7 +210,6 @@ module OpenTox
       value_time = 0

       # compounds and values
-      #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
       self.data_entries = []

       table.each_with_index do |vals,i|
-- cgit v1.2.3

From 7c3bd90c26dfeea2db3cf74a1cefc23d8dece7c0 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Tue, 15 Mar 2016 17:40:40 +0100
Subject: validation tests pass

---
 lib/dataset.rb | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

(limited to 'lib/dataset.rb')

diff --git a/lib/dataset.rb b/lib/dataset.rb
index af851b5..5d8aeaf 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -85,6 +85,7 @@ module OpenTox
           compound.dataset_ids << dataset.id
           compound.save
         end
+        dataset.save
         dataset
       end
       start = last+1
@@ -283,28 +284,6 @@ module OpenTox
       end
     end

-    def scale
-      scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
-      centers = []
-      scales = []
-      feature_ids.each_with_index do |feature_id,col|
-        R.assign "x", data_entries.collect{|de| de[col]}
-        R.eval "scaled = scale(x,center=T,scale=T)"
-        centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
-        scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
-        R.eval("scaled").to_ruby.each_with_index do |value,row|
-          scaled_data_entries[row][col] = value
-        end
-      end
-      scaled_dataset = ScaledDataset.new(attributes)
-      scaled_dataset["_id"] = BSON::ObjectId.new
-      scaled_dataset["_type"] = "OpenTox::ScaledDataset"
-      scaled_dataset.centers = centers
-      scaled_dataset.scales = scales
-      scaled_dataset.data_entries = scaled_data_entries
-      scaled_dataset.save
-      scaled_dataset
-    end
   end

   # Dataset for lazar predictions
-- cgit v1.2.3
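The scale method removed in the last commit shipped each feature column to R (via Rserve) only to compute column-wise z-scores. For reference, the equivalent centering and scaling in dependency-free Ruby, using the sample standard deviation that R's scale() defaults to; persisting centers and scales on a ScaledDataset, as the removed code did, is omitted here:

    # Center each column to mean 0 and scale it to standard deviation 1.
    # Note: a constant column has sd 0; production code would guard the division.
    def scale data_entries
      cols = data_entries.first.size
      scaled = Array.new(data_entries.size){Array.new(cols)}
      cols.times do |col|
        values = data_entries.collect{|row| row[col].to_f}
        center = values.inject(:+) / values.size
        sd = Math.sqrt(values.collect{|v| (v - center)**2}.inject(:+) / (values.size - 1))
        values.each_with_index{|v, row| scaled[row][col] = (v - center) / sd}
      end
      scaled
    end

    p scale([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    # => [[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]]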