From b2d80ad2e470fcb41af4b747142e5693f2fa4615 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Tue, 24 May 2016 13:05:53 +0200
Subject: dataset tests fixed

---
 lib/dataset.rb               | 43 ++++++-------------
 lib/validation-statistics.rb |  1 +
 test/dataset.rb              | 98 ++++++++++----------------------------------
 test/setup.rb                |  4 +-
 4 files changed, 37 insertions(+), 109 deletions(-)

diff --git a/lib/dataset.rb b/lib/dataset.rb
index 205f640..38a55a8 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,8 +5,6 @@ module OpenTox

   class Dataset

-    #field :substance_ids, type: Array, default: []
-    #field :feature_ids, type: Array, default: []
     field :data_entries, type: Hash, default: {}

     # Readers
@@ -27,7 +25,6 @@ module OpenTox

     # Get all features
     def features
-      #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
       @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
       @features
     end
@@ -44,16 +41,6 @@ module OpenTox

     # Writers

-    # Set compounds
-    def compounds=(compounds)
-      self.substance_ids = compounds.collect{|c| c.id}.uniq
-    end
-
-    # Set features
-    def features=(features)
-      self.feature_ids = features.collect{|f| f.id}
-    end
-
     def add(substance,feature,value)
       substance = substance.id if substance.is_a? Substance
       feature = feature.id if feature.is_a? Feature
@@ -87,8 +74,6 @@ module OpenTox
       chunk = [training_substances,test_substances].collect do |substances|
         dataset = self.class.create(:source => self.id )
         substances.each do |substance|
-          #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
-          #dataset.substances.each do |substance|
           substance.dataset_ids << dataset.id
           substance.save
           dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
@@ -108,7 +93,7 @@ module OpenTox
     # @return [String]
     def to_csv(inchi=false)
       CSV.generate() do |csv|
-        compound = Substance.find(substance_ids.first).is_a? Compound
+        compound = substances.first.is_a? Compound
         if compound
           csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
         else
@@ -128,11 +113,7 @@ module OpenTox
           (0..nr_measurements.first-1).each do |i|
             row = [name]
             features.each do |f|
-              if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s]
-                row << data_entries[substance.id.to_s][f.id.to_s]
-              else
-                row << ""
-              end
+              values(substance,f) ? row << values(substance,f)[i] : row << ""
             end
             csv << row
           end
@@ -152,8 +133,8 @@ module OpenTox

     # Create a dataset from CSV file
     # TODO: document structure
-    def self.from_csv_file file, source=nil
-      source ||= file
+    def self.from_csv_file file, accept_empty_values=false
+      source = file
       name = File.basename(file,".*")
       dataset = self.find_by(:source => source, :name => name)
       if dataset
@@ -162,14 +143,14 @@ module OpenTox
         $logger.debug "Parsing #{file}."
         table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
         dataset = self.new(:source => source, :name => name)
-        dataset.parse_table table
+        dataset.parse_table table, accept_empty_values
       end
       dataset
     end

    # parse data in tabular format (e.g. from csv)
    # does a lot of guesswork in order to determine feature types
-    def parse_table table
+    def parse_table table, accept_empty_values

      # features
      feature_names = table.shift.collect{|f| f.strip}
@@ -200,24 +181,25 @@ module OpenTox

      # substances and values
+     all_substances = []
      table.each_with_index do |vals,i|
        identifier = vals.shift.strip
-       warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i substance = OpenTox::Compound.from_inchi(identifier) - # TODO nanoparticle end rescue substance = nil end if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id substance.save @@ -237,10 +219,11 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end - substances.duplicates.each do |substance| + all_substances.duplicates.each do |substance| positions = [] - substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end save diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 2d6b56e..3c52b15 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -68,6 +68,7 @@ module OpenTox x = [] y = [] predictions.each do |cid,pred| + p pred if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] diff --git a/test/dataset.rb b/test/dataset.rb index 9bb3409..7ec9973 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -30,7 +30,7 @@ class DatasetTest < MiniTest::Test csv.shift csv.each do |row| c = Compound.from_smiles row.shift - assert_equal row, c.toxicities[d.features.first.id.to_s][d.id.to_s] + assert_equal row, d.values(c,d.features.first) end d.delete end @@ -45,7 +45,7 @@ class DatasetTest < MiniTest::Test # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC" - assert_equal c.toxicities[d.feature_ids.first.to_s][d.id.to_s][0], "1" + assert_equal ["1"], d.values(c,d.features.first) d.delete end @@ -64,9 +64,8 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal true, d.features.first.nominal - assert_equal csv.size-1-errors.size, d.compounds.size + assert_equal 1056, d.compounds.size assert_equal csv.first.size-1, d.features.size - puts d.warnings.to_yaml errors.each do |smi| refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}} end @@ -94,17 +93,13 @@ class DatasetTest < MiniTest::Test assert_equal csv.first.size-1, d.features.size assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name - refute_nil d.warnings - #p d.warnings - #assert_equal 74, d.warnings.size feature = d.features.first assert_kind_of NumericFeature, feature - assert_match /row 13/, d.warnings.join - assert_equal -Math.log10(0.0113), d.compounds.first.toxicities[feature.id.to_s][d.id.to_s].first - assert_equal -Math.log10(0.00323), d.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first + assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first + assert_equal -Math.log10(0.00323), d.values(d.compounds[4],feature).first d2 = Dataset.find d.id - 
-   assert_equal -Math.log10(0.0113), d2.compounds[0].toxicities[feature.id.to_s][d.id.to_s].first
-   assert_equal -Math.log10(0.00323), d2.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first
+   assert_equal -Math.log10(0.0113), d2.values(d2.compounds[0],feature).first
+   assert_equal -Math.log10(0.00323), d2.values(d2.compounds[4],feature).first
    d.delete
  end

@@ -112,11 +107,11 @@ class DatasetTest < MiniTest::Test

  def test_create_without_features_smiles_and_inchi
    ["smiles", "inchi"].each do |type|
-     d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv")
+     d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true
      assert_equal Dataset, d.class
      refute_nil d.id
      dataset = Dataset.find d.id
-     assert_equal 3, d.compounds.size.to_i
+     assert_equal 3, d.compounds.size
      d.delete
    end
  end
@@ -130,8 +125,8 @@ class DatasetTest < MiniTest::Test
        assert_operator d.compounds.size, :>=, d.compounds.uniq.size
      end
      assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
-     assert_equal dataset.substance_ids.size, fold.first.substance_ids.size + fold.last.substance_ids.size
-     assert_empty (fold.first.substance_ids & fold.last.substance_ids)
+     assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size
+     assert_empty (fold.first.substances & fold.last.substances)
    end
  end

@@ -184,13 +179,13 @@ class DatasetTest < MiniTest::Test
    # get features
    assert_equal 6, new_dataset.features.size
    assert_equal 5, new_dataset.compounds.uniq.size
-   de = new_dataset.compounds.last.toxicities
-   fid = new_dataset.features.first.id.to_s
-   assert_equal ["1"], de[fid][d.id.to_s]
-   fid = new_dataset.features.last.id.to_s
-   assert_equal [1.0], de[fid][d.id.to_s]
-   fid = new_dataset.features[2].id.to_s
-   assert_equal ["false"], de[fid][d.id.to_s]
+   c = new_dataset.compounds.last
+   f = new_dataset.features.first
+   assert_equal ["1"], new_dataset.values(c,f)
+   f = new_dataset.features.last.id.to_s
+   assert_equal [1.0], new_dataset.values(c,f)
+   f = new_dataset.features[2]
+   assert_equal ["false"], new_dataset.values(c,f)
    d.delete
  end

@@ -208,7 +203,7 @@ class DatasetTest < MiniTest::Test
    csv.shift
    csv.each do |row|
      c = Compound.from_smiles row.shift
-     assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s]
+     assert_equal row, d.values(c,d.features.first)
    end
    d.delete
  end

@@ -217,7 +212,7 @@ class DatasetTest < MiniTest::Test
  def test_from_csv2
    File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
    dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
-   assert_equal "Cannot parse SMILES compound '' at position 3, all entries are ignored.", dataset.warnings.join
+   assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join
    File.delete "#{DATA_DIR}/temp_test.csv"
    dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
    dataset.delete
@@ -251,9 +246,7 @@ class DatasetTest < MiniTest::Test
    csv.each do |row|
      c = Compound.from_smiles(row.shift)
      p row
-     p c.toxicities
-     p d.feature_ids.first.to_s
-     assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s]
+     assert_equal row, d.values(c,d.features.first)
    end
    d.delete
  end

@@ -287,54 +280,5 @@ class DatasetTest < MiniTest::Test
    assert_nil Dataset.find d.id
  end

- def test_client_create
-   skip
-   d = Dataset.new
-   assert_equal Dataset, d.class
-   d.name = "Create dataset test"
-
-   # add data entries
["test1", "test2"].collect do |title| - f = Feature.new - f.name = title - f.numeric = true - f.save - f - end - - # manual low-level insertions without consistency checks for runtime efficiency - compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi| - Compound.from_smiles smi - end - data_entries = [] - data_entries << [1,2] - data_entries << [4,5] - data_entries << [6,7] - compounds.each_with_index do |c,i| - features.each_with_index do |f,j| - d.substance_ids << c.id - d.feature_ids << f.id - c.toxicities[f.id.to_s] = data_entries[i][j] - end - end - - assert_equal 3, d.compounds.size - assert_equal 2, d.features.size - #assert_equal [[1,2],[4,5],[6,7]], d.data_entries - d.save - # check if dataset has been saved correctly - new_dataset = Dataset.find d.id - assert_equal 3, new_dataset.compounds.size - assert_equal 2, new_dataset.features.size - new_dataset.compounds.each_with_index do |c,i| - new_dataset.features.each_with_index do |f,j| - assert_equal data_entries[i][j], c.toxicities[f.id.to_s].first - end - end - d.delete - assert_nil Dataset.find d.id - assert_nil Dataset.find new_dataset.id - end - end diff --git a/test/setup.rb b/test/setup.rb index 6c97282..e7c32b4 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -#$mongo.database.drop -#$gridfs = $mongo.database.fs +$mongo.database.drop +$gridfs = $mongo.database.fs -- cgit v1.2.3