From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- lib/compound.rb | 28 ++----- lib/dataset.rb | 178 +++++++++++++++++++++++++++++-------------- lib/feature.rb | 34 ++++++--- lib/lazar.rb | 2 +- lib/model.rb | 8 +- lib/opentox.rb | 10 +-- test/classification-model.rb | 47 ++++++------ test/dataset.rb | 89 ++++++++++++++-------- 8 files changed, 246 insertions(+), 150 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 22c8575..0714574 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -33,13 +33,11 @@ module OpenTox def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles - #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - if type == "MP2D" + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title fingerprints[type] = fp.uniq # no fingerprint counts - #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - elsif type== "MNA" + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html level = 2 # TODO: level as parameter, evaluate level 1, see paper fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") fp.shift # remove Title @@ -128,17 +126,9 @@ module OpenTox # @param [String] smiles # @return [OpenTox::Compound] def self.from_smiles smiles - if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." - return nil - end + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - if smiles.empty? - warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." - return nil - else - Compound.find_or_create_by :smiles => smiles - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from InChI string @@ -146,11 +136,7 @@ module OpenTox # @return [OpenTox::Compound] def self.from_inchi inchi smiles = obconversion(inchi,"inchi","can") - if smiles.empty? - Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) - else - Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end # Create a compound from SDF @@ -328,11 +314,11 @@ module OpenTox print sdf if sdf.match(/.nan/) - warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - warn "2D generation failed for compound #{identifier}, rendering without coordinates." + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) end diff --git a/lib/dataset.rb b/lib/dataset.rb index bbb20be..aa66c9f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -8,6 +8,7 @@ module OpenTox class Dataset field :data_entries, type: Hash, default: {} + field :source, type: String field :md5, type: String # Readers @@ -52,6 +53,44 @@ module OpenTox end end + # Get OriginalId feature + # @return [OpenTox::OriginalId] + def original_id_feature + features.select{|f| f.is_a?(OriginalId)}.first + end + + # Get original id + # @param [OpenTox::Substance] substance + # @return [String] original id + def original_id substance + values(substance,original_id_feature).first + end + + # Get OriginalSmiles feature + # @return [OpenTox::OriginalSmiles] + def original_smiles_feature + features.select{|f| f.is_a?(OriginalSmiles)}.first + end + + # Get original SMILES + # @param [OpenTox::Substance] substance + # @return [String] original SMILES + def original_smiles substance + values(substance,original_smiles_feature).first + end + + # Get nominal and numeric bioactivity features + # @return [Array] + def bioactivity_features + features.select{|f| f.class.to_s.match("BioActivity")} + end + + # Get nominal and numeric bioactivity features + # @return [Array] + def transformed_bioactivity_features + features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)} + end + # Writers # Add a value for a given substance and feature @@ -188,41 +227,38 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + bad_request_error "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - original_id = nil if feature_names[0] =~ /ID/i # check ID column - feature_names.shift - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => self.name+".ID") + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) + else + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") end + warnings = Warnings.find_or_create_by(:dataset_id => self.id) + compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i) + numeric = [] features = [] # guess feature types bioactivity = true if feature_names.size == 1 + feature_names.each_with_index do |f,i| - original_id ? j = i+2 : j = i+1 + original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - if bioactivity - feature = NumericBioActivity.find_or_create_by(:name => f) - else - feature = NumericSubstanceProperty.find_or_create_by(:name => f) - end + bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) else numeric[i] = false - if bioactivity - feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) - else - feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end + bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) end features << feature if feature end @@ -231,32 +267,37 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| - original_id_value = vals.shift.strip if original_id + original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.strip identifier = vals.shift.strip begin case compound_format when /SMILES/i substance = Compound.from_smiles(identifier) + add substance, original_smiles, identifier when /InChI/i substance = Compound.from_inchi(identifier) end rescue substance = nil end + if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + add substance, original_id, original_id_value + add substance, original_smiles, identifier + add substance, warnings, "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance substance.dataset_ids << self.id substance.dataset_ids.uniq! substance.save - add substance, original_id, original_id_value if original_id + add substance, original_id, original_id_value vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + add substance, warnings, "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." v = nil elsif numeric[j] v = v.to_f @@ -265,13 +306,15 @@ module OpenTox end add substance, features[j], v end - data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions + #data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end all_substances.duplicates.each do |substance| positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} - warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} + all_substances.select{|s| s.smiles == substance.smiles}.each do |s| + add s, warnings, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + end end save end @@ -280,13 +323,20 @@ module OpenTox # Convert dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] - def to_csv(inchi=false) + def to_csv inchi=false CSV.generate() do |csv| compound = substances.first.is_a? Compound + id = features.select{|f| f.is_a? OriginalId}.first + features.delete(id) + original_smiles = features.select{|f| f.is_a? OriginalSmiles}.first + features.delete(original_smiles) + warning = features.select{|f| f.is_a? Warnings}.first + features.delete(warning) + if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + csv << [id.name, inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + ["OriginalSmiles", "Warnings"] else - csv << ["Name"] + features.collect{|f| f.name} + csv << [id.name, "Name"] + features.collect{|f| f.name} end substances.each do |substance| if compound @@ -294,19 +344,10 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq - - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." - else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row - end - end + row = [values(substance,id).first,name] + features.collect{|f| values(substance,f).join(" ")} + row << values(substance,original_smiles).join(" ") + row << values(substance,warning).join(" ") + csv << row end end end @@ -332,18 +373,19 @@ module OpenTox # Merge an array of datasets # @param [Array] OpenTox::Dataset Array to be merged - # @param [Hash] feature modifications - # @param [Hash] value modifications + # @param [Array] OpenTox::Feature Array to be merged # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, feature_map=nil, value_map=nil - dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + def self.merge datasets, features + # TODO warnings + features.uniq! + dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) datasets.each do |d| d.substances.each do |s| - d.features.each do |f| + dataset.add s,d.original_id_feature,d.original_id(s) + dataset.add s,d.original_smiles_feature,d.original_smiles(s) + features.each do |f| d.values(s,f).each do |v| - f = feature_map[f] if feature_map and feature_map[f] - v = value_map[v] if value_map and value_map[v] - dataset.add s,f,v #unless dataset.values(s,f).include? v + dataset.add s,features.first,v #unless dataset.values(s,f).include? v end end end @@ -352,6 +394,17 @@ module OpenTox dataset end + # Copy a dataset + # @return OpenTox::Dataset dataset copy + def copy + dataset = Dataset.new + dataset.data_entries = data_entries + dataset.name = name + dataset.source = id.to_s + dataset.save + dataset + end + # Split a dataset into n folds # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] @@ -384,6 +437,19 @@ module OpenTox end chunks end + + # Change nominal feature values + # @param [NominalFeature] Original feature + # @param [Hash] how to change feature values + def map feature, map + dataset = self.copy + new_feature = TransformedNominalBioActivity.find_or_create_by(:name => feature.name + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) + compounds.each do |c| + values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + end + dataset.save + dataset + end def transform # TODO end @@ -397,9 +463,9 @@ module OpenTox end # Dataset for lazar predictions - class LazarPrediction #< Dataset + class LazarPrediction < Dataset field :creator, type: String - field :prediction_feature_id, type: BSON::ObjectId + #field :prediction_feature_id, type: BSON::ObjectId field :predictions, type: Hash, default: {} # Get prediction feature @@ -408,16 +474,16 @@ module OpenTox Feature.find prediction_feature_id end - # Get all compounds - # @return [Array] - def compounds - substances.select{|s| s.is_a? Compound} + def prediction compound end - # Get all substances - # @return [Array] - def substances - predictions.keys.collect{|id| Substance.find id} + def probability klass + end + + def prediction_interval + end + + def predictions end end diff --git a/lib/feature.rb b/lib/feature.rb index 2c10c26..056957b 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -1,32 +1,46 @@ module OpenTox - # Basic feature class - class Feature - end - # Original ID (e.g. from CSV input) class OriginalId < Feature field :dataset_id, type: BSON::ObjectId end - # Feature for categorical variables + # Original SMILES (e.g. from CSV input) + class OriginalSmiles < Feature + field :dataset_id, type: BSON::ObjectId + end + + # Warnings + class Warnings < Feature + field :dataset_id, type: BSON::ObjectId + end + + # Categorical variables class NominalFeature < Feature field :accept_values, type: Array end - # Feature for quantitative variables + # Quantitative variables class NumericFeature < Feature field :unit, type: String end # Nominal biological activity class NominalBioActivity < NominalFeature - field :original_feature_id, type: BSON::ObjectId - field :transformation, type: Hash end # Numeric biological activity class NumericBioActivity < NumericFeature + end + + # Transformed nominal biological activity + class TransformedNominalBioActivity < NominalFeature + field :original_feature_id, type: BSON::ObjectId + field :transformation, type: Hash + end + + # Transformed numeric biological activity + class TransformedNumericBioActivity < NumericFeature field :original_feature_id, type: BSON::ObjectId field :transformation, type: String end @@ -38,7 +52,6 @@ module OpenTox end class LazarPredictionProbability < NominalLazarPrediction - field :value, type: Float end # Numeric lazar prediction @@ -47,6 +60,9 @@ module OpenTox field :training_feature_id, type: BSON::ObjectId end + class LazarConfidenceInterval < NumericLazarPrediction + end + class NominalSubstanceProperty < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index 13ad1f8..7e813e4 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -72,7 +72,7 @@ PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" # OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", diff --git a/lib/model.rb b/lib/model.rb index 7ee50fe..9858949 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -199,7 +199,6 @@ module OpenTox # @return [Hash] def predict_substance substance, threshold = self.algorithms[:similarity][:min] - #p substance.smiles t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -286,7 +285,6 @@ module OpenTox else # try again with a lower threshold predict_substance substance, 0.2 end - #p prediction #p Time.now - t prediction end @@ -330,11 +328,12 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset + warning_feature = InfoFeature.find_or_create_by(:name => "Warnings") if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} prediction_feature.accept_values.each do |v| - probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) @@ -344,10 +343,11 @@ module OpenTox d = Dataset.new(:name => object.name) # add predictions to dataset predictions.each do |substance_id,p| - d.warnings += p[:warnings] + d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} + # TODO prediction interval end end d.save diff --git a/lib/opentox.rb b/lib/opentox.rb index 03d65b0..9cc8260 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -11,13 +11,13 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String - field :source, type: String - field :warnings, type: Array, default: [] + #field :source, type: String + #field :warnings, type: Array, default: [] - def warn warning +# def warn warning #$logger.warn warning - warnings << warning - end +# warnings << warning +# end end OpenTox.const_set klass,c end diff --git a/test/classification-model.rb b/test/classification-model.rb index b94b5e6..7a2a64f 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -22,37 +22,40 @@ class LazarClassificationTest < MiniTest::Test assert_kind_of Model::LazarClassification, model assert_equal algorithms, model.algorithms [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), :prediction => "false", },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), - :prediction => "false", + :compound => OpenTox::Compound.from_smiles("O=CNc1scc(n1)c1ccc(o1)[N+](=O)[O-]"), + :prediction => "true", } ].each do |example| prediction = model.predict example[:compound] - p example[:compound] - p prediction - #assert_equal example[:prediction], prediction[:value] + assert_equal example[:prediction], prediction[:value] end - compound = Compound.from_smiles "CCO" - prediction = model.predict compound - assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") prediction_dataset = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction_dataset.compounds + puts prediction_dataset.to_csv + assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size + c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O" + prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0] + assert_equal ["true"], prediction_dataset.values(c, prediction_feature) + p_true = LazarPredictionProbability.find_by(:name => "true") + p_false = LazarPredictionProbability.find_by(:name => "false") + p p_true + assert_equal [0.7], prediction_dataset.values(c,p_true) + assert_equal [0.0], prediction_dataset.values(c,p_false) + assert_equal 0.0, p_false - cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] - expectations = ["Cannot create prediction: Only one similar compound in the training set.", - "Could not find similar substances with experimental data in the training dataset."] - prediction_dataset.predictions.each do |cid,pred| - assert_includes expectations, pred[:warnings][0] if pred[:value].nil? - end - cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:info] +# cid = prediction_dataset.compounds[7].id.to_s +# assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] +# expectations = ["Cannot create prediction: Only one similar compound in the training set.", +# "Could not find similar substances with experimental data in the training dataset."] +# prediction_dataset.predictions.each do |cid,pred| +# assert_includes expectations, pred[:warnings][0] if pred[:value].nil? +# end +# cid = Compound.from_smiles("CCOC(=O)N").id.to_s +# assert_match "excluded", prediction_dataset.predictions[cid][:info] end def test_classification_parameters diff --git a/test/dataset.rb b/test/dataset.rb index 2b439bb..163f178 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -26,8 +26,8 @@ class DatasetTest < MiniTest::Test def test_import_pubchem d = Dataset.from_pubchem_aid 1191 assert_equal 87, d.compounds.size - assert_equal 2, d.features.size - assert_equal ["Active"], d.values(d.compounds[10],d.features[1]) + assert_equal 3, d.features.size + assert_equal ["Active"], d.values(d.compounds[10],d.features[2]) # TODO endpoint name # TODO regression import end @@ -35,9 +35,9 @@ class DatasetTest < MiniTest::Test def test_import_csv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.csv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end @@ -45,16 +45,16 @@ class DatasetTest < MiniTest::Test def test_import_tsv_with_id d = Dataset.from_csv_file "#{DATA_DIR}/input_53.tsv" assert_equal 53, d.compounds.size - assert_equal 1, d.features.size - f = d.features[0] - assert_equal "input_53.ID", f.name + assert_equal 2, d.features.size + f = d.features[1] + assert_equal "ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal 35, d.features.size + assert_equal 37, d.features.size assert_kind_of NumericSubstanceProperty, d.features[1] assert_equal NominalSubstanceProperty, d.features.last.class assert_equal 602, d.compounds.size @@ -64,7 +64,7 @@ class DatasetTest < MiniTest::Test def test_import_hamster d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size assert_equal NominalBioActivity, d.features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") @@ -81,7 +81,7 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_empty d.warnings # 493 COC1=C(C=C(C(=C1)Cl)OC)Cl,1 c = d.compounds[491] @@ -121,8 +121,9 @@ class DatasetTest < MiniTest::Test d = OpenTox::Dataset.from_csv_file f csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size - d.delete + assert_equal csv.first.size+1, d.features.size + # TODO fix csv output (headers, column order) + puts d.to_csv end def test_import_epafhm @@ -131,7 +132,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class csv = CSV.read f assert_equal csv.size-1, d.compounds.size - assert_equal csv.first.size-1, d.features.size + assert_equal csv.first.size+1, d.features.size assert_match "EPAFHM_log10.csv", d.source assert_equal "EPAFHM_log10", d.name feature = d.features.first @@ -168,23 +169,6 @@ class DatasetTest < MiniTest::Test # dataset operations - def test_merge - skip # TODO use new Features - source_feature = Feature.where(:name => "Ames test categorisation").first - target_feature = Feature.where(:name => "Mutagenicity").first - kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} - #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} - assert_equal 8281, d.compounds.size - c = Compound.from_smiles("C/C=C/C=O") - assert_equal ["mutagen"], d.values(c,target_feature) - assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - p d.features - assert_equal 4, d.features.size - end - def test_folds dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") dataset.folds(10).each do |fold| @@ -197,10 +181,48 @@ class DatasetTest < MiniTest::Test end end + def test_copy + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + copy = d.copy + assert_equal d.data_entries, copy.data_entries + assert_equal d.name, copy.name + assert_equal d.id.to_s, copy.source + end + + def test_map + d = Dataset.from_csv_file("#{DATA_DIR}/hamster_carcinogenicity.csv") + assert_equal 1, d.bioactivity_features.size + map = {"true" => "carcinogen", "false" => "non-carcinogen"} + mapped = d.map(d.bioactivity_features.first, map) + c = d.compounds.sample + assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) + assert_equal d.original_id(c), mapped.original_id(c) + assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name + assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values + end + + def test_merge + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + datasets = [kazius,hansen_mapped,efsa_mapped] + d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq + File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + assert_equal 8281, d.compounds.size + c = Compound.from_smiles("C/C=C/C=O") + assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) + assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + p d.features + assert_equal 4, d.features.size + end + # serialisation def test_to_csv d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" + # TODO warnings refute_nil d.warnings assert d.warnings.grep(/Duplicate compound/) assert d.warnings.grep(/3, 5/) @@ -268,6 +290,7 @@ class DatasetTest < MiniTest::Test def test_create_from_file_with_wrong_smiles_compound_entries d = Dataset.from_csv_file File.join(DATA_DIR,"wrong_dataset.csv") + p d.to_csv refute_nil d.warnings assert_match /2|3|4|5|6|7|8/, d.warnings.join d.delete @@ -289,6 +312,8 @@ class DatasetTest < MiniTest::Test def test_from_csv2 File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") } dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv" + p dataset + p dataset.to_csv assert_equal "Cannot parse SMILES compound '' at line 3 of /home/ist/lazar/test/data/temp_test.csv, all entries are ignored.", dataset.warnings.join File.delete "#{DATA_DIR}/temp_test.csv" dataset.features.each{|f| feature = Feature.find f.id; feature.delete} @@ -313,7 +338,7 @@ class DatasetTest < MiniTest::Test threads << Thread.new(t) do |up| d = OpenTox::Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" assert_equal OpenTox::Dataset, d.class - assert_equal 1, d.features.size + assert_equal 3, d.features.size assert_equal 85, d.compounds.size csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift -- cgit v1.2.3