From 511b3033b7359a8bf23cac42852003e94044cd47 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 19 Oct 2015 11:42:16 +0200 Subject: GridFS storage for data_entries removed (will break kazius/fminer models), lazy creation of classification confidence plots. --- lib/crossvalidation.rb | 36 +++++++++++++++++++----------------- lib/dataset.rb | 31 ++++++++++++++++++------------- test/dataset.rb | 1 + test/setup.rb | 4 ++-- 4 files changed, 40 insertions(+), 32 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index cbffb7c..2e6dabb 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -128,26 +128,28 @@ module OpenTox end def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.svg" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - if p[1] and p[2] - p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[3] + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + if p[1] and p[2] + p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[3] + end end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) $gridfs.find_one(_id: confidence_plot_id).data end diff --git a/lib/dataset.rb b/lib/dataset.rb index 60f3bb5..d989bdf 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,21 +5,23 @@ module OpenTox class Dataset - attr_writer :data_entries + #attr_writer :data_entries # associations like has_many, belongs_to deteriorate performance field :feature_ids, type: Array, default: [] field :compound_ids, type: Array, default: [] - field :data_entries_id, type: BSON::ObjectId#, default: [] + #field :data_entries_id, type: BSON::ObjectId + field :data_entries, type: Array, default: [] field :source, type: String # Save all data including data_entries # Should be used instead of save def save_all - dump = Marshal.dump(@data_entries) - file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") - entries_id = $gridfs.insert_one(file) - update(:data_entries_id => entries_id) + save + #dump = Marshal.dump(@data_entries) + #file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") + #entries_id = $gridfs.insert_one(file) + #update(:data_entries_id => entries_id) end # Readers @@ -36,6 +38,7 @@ module OpenTox @features end +=begin # Get all data_entries def data_entries unless @data_entries @@ -60,6 +63,7 @@ module OpenTox end @data_entries end +=end # Find data entry values for a given compound and feature # @param compound [OpenTox::Compound] OpenTox Compound object @@ -220,7 +224,8 @@ module OpenTox value_time = 0 # compounds and values - @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} + #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} + self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now @@ -251,16 +256,16 @@ module OpenTox end compound_ids << compound.id - table.first.size == 0 ? @data_entries << Array.new(0) : @data_entries << Array.new(table.first.size-1) + table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) vals.each_with_index do |v,j| if v.blank? warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] - @data_entries.last[j] = v.to_f + self.data_entries.last[j] = v.to_f else - @data_entries.last[j] = v.strip + self.data_entries.last[j] = v.strip end end end @@ -272,7 +277,7 @@ module OpenTox $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" time = Time.now - save_all + save $logger.debug "Saving: #{Time.now-time}" end @@ -281,9 +286,9 @@ module OpenTox # @param any value def fill_nil_with n (0 .. compound_ids.size-1).each do |i| - @data_entries[i] ||= [] + data_entries[i] ||= [] (0 .. feature_ids.size-1).each do |j| - @data_entries[i][j] ||= n + data_entries[i][j] ||= n end end end diff --git a/test/dataset.rb b/test/dataset.rb index 60f917c..47a6c25 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -141,6 +141,7 @@ class DatasetTest < MiniTest::Test def test_from_csv d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + p d assert_equal Dataset, d.class assert_equal 1, d.features.size assert_equal 85, d.compounds.size diff --git a/test/setup.rb b/test/setup.rb index 3825282..dc577b3 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -4,5 +4,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -#$mongo.database.drop -#$gridfs = $mongo.database.fs +$mongo.database.drop +$gridfs = $mongo.database.fs -- cgit v1.2.3