author     Christoph Helma <helma@in-silico.ch>  2016-05-24 13:05:53 +0200
committer  Christoph Helma <helma@in-silico.ch>  2016-05-24 13:05:53 +0200
commit     b2d80ad2e470fcb41af4b747142e5693f2fa4615 (patch)
tree       0bf14d8ee8acba4609354e576a03736f085ae720
parent     c90644211e214a50f6fdb3a936bf247f45f1f4be (diff)
dataset tests fixed
-rw-r--r--  lib/dataset.rb                |  43
-rw-r--r--  lib/validation-statistics.rb  |   1
-rw-r--r--  test/dataset.rb               |  98
-rw-r--r--  test/setup.rb                 |   4
4 files changed, 37 insertions(+), 109 deletions(-)
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 205f640..38a55a8 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,8 +5,6 @@ module OpenTox
class Dataset
- #field :substance_ids, type: Array, default: []
- #field :feature_ids, type: Array, default: []
field :data_entries, type: Hash, default: {}
# Readers
@@ -27,7 +25,6 @@ module OpenTox
# Get all features
def features
- #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
@features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
@features
end
@@ -44,16 +41,6 @@ module OpenTox
# Writers
- # Set compounds
- def compounds=(compounds)
- self.substance_ids = compounds.collect{|c| c.id}.uniq
- end
-
- # Set features
- def features=(features)
- self.feature_ids = features.collect{|f| f.id}
- end
-
def add(substance,feature,value)
substance = substance.id if substance.is_a? Substance
feature = feature.id if feature.is_a? Feature
@@ -87,8 +74,6 @@ module OpenTox
chunk = [training_substances,test_substances].collect do |substances|
dataset = self.class.create(:source => self.id )
substances.each do |substance|
- #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
- #dataset.substances.each do |substance|
substance.dataset_ids << dataset.id
substance.save
dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
@@ -108,7 +93,7 @@ module OpenTox
# @return [String]
def to_csv(inchi=false)
CSV.generate() do |csv|
- compound = Substance.find(substance_ids.first).is_a? Compound
+ compound = substances.first.is_a? Compound
if compound
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
else
@@ -128,11 +113,7 @@ module OpenTox
(0..nr_measurements.first-1).each do |i|
row = [name]
features.each do |f|
- if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s]
- row << data_entries[substance.id.to_s][f.id.to_s]
- else
- row << ""
- end
+ values(substance,f) ? row << values(substance,f)[i] : row << ""
end
csv << row
end
@@ -152,8 +133,8 @@ module OpenTox
# Create a dataset from CSV file
# TODO: document structure
- def self.from_csv_file file, source=nil
- source ||= file
+ def self.from_csv_file file, accept_empty_values=false
+ source = file
name = File.basename(file,".*")
dataset = self.find_by(:source => source, :name => name)
if dataset
@@ -162,14 +143,14 @@ module OpenTox
$logger.debug "Parsing #{file}."
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
dataset = self.new(:source => source, :name => name)
- dataset.parse_table table
+ dataset.parse_table table, accept_empty_values
end
dataset
end
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
- def parse_table table
+ def parse_table table, accept_empty_values
# features
feature_names = table.shift.collect{|f| f.strip}
@@ -200,24 +181,25 @@ module OpenTox
# substances and values
+ all_substances = []
table.each_with_index do |vals,i|
identifier = vals.shift.strip
- warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
begin
case compound_format
when /SMILES/i
substance = OpenTox::Compound.from_smiles(identifier)
when /InChI/i
substance = OpenTox::Compound.from_inchi(identifier)
- # TODO nanoparticle
end
rescue
substance = nil
end
if substance.nil? # compound parsers may return nil
- warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
next
end
+ all_substances << substance
substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id
substance.save
@@ -237,10 +219,11 @@ module OpenTox
end
add substance, features[j], v
end
+ data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
end
- substances.duplicates.each do |substance|
+ all_substances.duplicates.each do |substance|
positions = []
- substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
save
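Note: the hunks above replace the explicit substance_ids/feature_ids fields with lookups on the data_entries Hash, and to_csv (like the tests further down) now reads measurements through a values(substance, feature) accessor that is not itself shown in this diff. A minimal sketch of how that accessor presumably behaves, given that the removed to_csv branch indexed data_entries by substance id and then feature id (the method body is an assumption, not commit content):

    # Sketch only: data_entries is assumed to map substance id => feature id => [measurements],
    # matching how the removed to_csv branch indexed it.
    def values(substance, feature)
      substance = substance.id if substance.is_a? Substance
      feature = feature.id if feature.is_a? Feature
      data_entries[substance.to_s][feature.to_s] if data_entries[substance.to_s]
    end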
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 2d6b56e..3c52b15 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -68,6 +68,7 @@ module OpenTox
x = []
y = []
predictions.each do |cid,pred|
+ p pred
if pred[:value] and pred[:measured]
x << pred[:measured].median
y << pred[:value]
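The only change in this file is a debug print of each prediction. The shape of pred implied by the surrounding lines is roughly the following (a sketch; the concrete numbers and the median implementation on the measured values are assumptions):

    # assumed shape of the hash inspected by `p pred` above
    pred = {
      :value    => 2.1,          # predicted value
      :measured => [1.9, 2.3]    # measured values; the code calls .median on this object
    }
    x_entry = pred[:measured].median   # requires a median method (e.g. an Array extension)
    y_entry = pred[:value]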
diff --git a/test/dataset.rb b/test/dataset.rb
index 9bb3409..7ec9973 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -30,7 +30,7 @@ class DatasetTest < MiniTest::Test
csv.shift
csv.each do |row|
c = Compound.from_smiles row.shift
- assert_equal row, c.toxicities[d.features.first.id.to_s][d.id.to_s]
+ assert_equal row, d.values(c,d.features.first)
end
d.delete
end
@@ -45,7 +45,7 @@ class DatasetTest < MiniTest::Test
# 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1
c = d.compounds[491]
assert_equal c.smiles, "COc1cc(Cl)c(cc1Cl)OC"
- assert_equal c.toxicities[d.feature_ids.first.to_s][d.id.to_s][0], "1"
+ assert_equal ["1"], d.values(c,d.features.first)
d.delete
end
@@ -64,9 +64,8 @@ class DatasetTest < MiniTest::Test
d = OpenTox::Dataset.from_csv_file f
csv = CSV.read f
assert_equal true, d.features.first.nominal
- assert_equal csv.size-1-errors.size, d.compounds.size
+ assert_equal 1056, d.compounds.size
assert_equal csv.first.size-1, d.features.size
- puts d.warnings.to_yaml
errors.each do |smi|
refute_empty d.warnings.grep %r{#{Regexp.escape(smi)}}
end
@@ -94,17 +93,13 @@ class DatasetTest < MiniTest::Test
assert_equal csv.first.size-1, d.features.size
assert_match "EPAFHM_log10.csv", d.source
assert_equal "EPAFHM_log10", d.name
- refute_nil d.warnings
- #p d.warnings
- #assert_equal 74, d.warnings.size
feature = d.features.first
assert_kind_of NumericFeature, feature
- assert_match /row 13/, d.warnings.join
- assert_equal -Math.log10(0.0113), d.compounds.first.toxicities[feature.id.to_s][d.id.to_s].first
- assert_equal -Math.log10(0.00323), d.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first
+ assert_equal -Math.log10(0.0113), d.values(d.compounds.first,feature).first
+ assert_equal -Math.log10(0.00323), d.values(d.compounds[4],feature).first
d2 = Dataset.find d.id
- assert_equal -Math.log10(0.0113), d2.compounds[0].toxicities[feature.id.to_s][d.id.to_s].first
- assert_equal -Math.log10(0.00323), d2.compounds[5].toxicities[feature.id.to_s][d.id.to_s].first
+ assert_equal -Math.log10(0.0113), d2.values(d2.compounds[0],feature).first
+ assert_equal -Math.log10(0.00323), d2.values(d2.compounds[4],feature).first
d.delete
end
@@ -112,11 +107,11 @@ class DatasetTest < MiniTest::Test
def test_create_without_features_smiles_and_inchi
["smiles", "inchi"].each do |type|
- d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv")
+ d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true
assert_equal Dataset, d.class
refute_nil d.id
dataset = Dataset.find d.id
- assert_equal 3, d.compounds.size.to_i
+ assert_equal 3, d.compounds.size
d.delete
end
end
@@ -130,8 +125,8 @@ class DatasetTest < MiniTest::Test
assert_operator d.compounds.size, :>=, d.compounds.uniq.size
end
assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
- assert_equal dataset.substance_ids.size, fold.first.substance_ids.size + fold.last.substance_ids.size
- assert_empty (fold.first.substance_ids & fold.last.substance_ids)
+ assert_equal dataset.substances.size, fold.first.substances.size + fold.last.substances.size
+ assert_empty (fold.first.substances & fold.last.substances)
end
end
@@ -184,13 +179,13 @@ class DatasetTest < MiniTest::Test
# get features
assert_equal 6, new_dataset.features.size
assert_equal 5, new_dataset.compounds.uniq.size
- de = new_dataset.compounds.last.toxicities
- fid = new_dataset.features.first.id.to_s
- assert_equal ["1"], de[fid][d.id.to_s]
- fid = new_dataset.features.last.id.to_s
- assert_equal [1.0], de[fid][d.id.to_s]
- fid = new_dataset.features[2].id.to_s
- assert_equal ["false"], de[fid][d.id.to_s]
+ c = new_dataset.compounds.last
+ f = new_dataset.features.first
+ assert_equal ["1"], new_dataset.values(c,f)
+ f = new_dataset.features.last.id.to_s
+ assert_equal [1.0], new_dataset.values(c,f)
+ f = new_dataset.features[2]
+ assert_equal ["false"], new_dataset.values(c,f)
d.delete
end
@@ -208,7 +203,7 @@ class DatasetTest < MiniTest::Test
csv.shift
csv.each do |row|
c = Compound.from_smiles row.shift
- assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s]
+ assert_equal row, d.values(c,d.features.first)
end
d.delete
end
@@ -217,7 +212,7 @@ class DatasetTest < MiniTest::Test
def test_from_csv2
File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
- assert_equal "Cannot parse SMILES compound '' at position 3, all entries are ignored.", dataset.warnings.join
+ assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join
File.delete "#{DATA_DIR}/temp_test.csv"
dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
dataset.delete
@@ -251,9 +246,7 @@ class DatasetTest < MiniTest::Test
csv.each do |row|
c = Compound.from_smiles(row.shift)
p row
- p c.toxicities
- p d.feature_ids.first.to_s
- assert_equal row, c.toxicities[d.feature_ids.first.to_s][d.id.to_s]
+ assert_equal row, d.values(c,d.features.first)
end
d.delete
end
@@ -287,54 +280,5 @@ class DatasetTest < MiniTest::Test
assert_nil Dataset.find d.id
end
- def test_client_create
- skip
- d = Dataset.new
- assert_equal Dataset, d.class
- d.name = "Create dataset test"
-
- # add data entries
- features = ["test1", "test2"].collect do |title|
- f = Feature.new
- f.name = title
- f.numeric = true
- f.save
- f
- end
-
- # manual low-level insertions without consistency checks for runtime efficiency
- compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi|
- Compound.from_smiles smi
- end
- data_entries = []
- data_entries << [1,2]
- data_entries << [4,5]
- data_entries << [6,7]
- compounds.each_with_index do |c,i|
- features.each_with_index do |f,j|
- d.substance_ids << c.id
- d.feature_ids << f.id
- c.toxicities[f.id.to_s] = data_entries[i][j]
- end
- end
-
- assert_equal 3, d.compounds.size
- assert_equal 2, d.features.size
- #assert_equal [[1,2],[4,5],[6,7]], d.data_entries
- d.save
- # check if dataset has been saved correctly
- new_dataset = Dataset.find d.id
- assert_equal 3, new_dataset.compounds.size
- assert_equal 2, new_dataset.features.size
- new_dataset.compounds.each_with_index do |c,i|
- new_dataset.features.each_with_index do |f,j|
- assert_equal data_entries[i][j], c.toxicities[f.id.to_s].first
- end
- end
- d.delete
- assert_nil Dataset.find d.id
- assert_nil Dataset.find new_dataset.id
- end
-
end
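Taken together, the test changes read measurements through the new values accessor and pass the new accept_empty_values flag straight into from_csv_file. A usage sketch, assuming the CSV files referenced by the tests exist under test/data:

    # usage sketch mirroring the updated assertions (file names taken from the tests above)
    d = Dataset.from_csv_file File.join(DATA_DIR, "EPAFHM_log10.csv")
    feature = d.features.first
    d.values(d.compounds.first, feature).first   # => -Math.log10(0.0113) per the assertions
    # the second argument accepts rows without feature values:
    d2 = Dataset.from_csv_file File.join(DATA_DIR, "batch_prediction_smiles_small.csv"), true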
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..e7c32b4 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
+$mongo.database.drop
+$gridfs = $mongo.database.fs