From 15f4ad23eb918a91d52779887ccfb51bc6547f1b Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 25 Oct 2018 18:58:19 +0200 Subject: dataset merge --- lib/dataset.rb | 111 +++++++++++++++++++++++------------------------------- lib/feature.rb | 5 +++ test/dataset.rb | 44 ++++++++-------------- test/use_cases.rb | 50 ++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 92 deletions(-) create mode 100644 test/use_cases.rb diff --git a/lib/dataset.rb b/lib/dataset.rb index c652b25..9611fff 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -29,7 +29,7 @@ module OpenTox # Get all substances # @return [Array] def substances - @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0] if row[0]}.compact.uniq + @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq @substances end @@ -43,47 +43,31 @@ module OpenTox # Get all values for a given substance and feature # @param [OpenTox::Substance,BSON::ObjectId] substance or substance id # @param [OpenTox::Feature,BSON::ObjectId] feature or feature id - # @return [TrueClass,FalseClass,Float] + # @return [Array] values def values substance,feature substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} end - # Get OriginalId feature - # @return [OpenTox::OriginalId] - def original_id_feature - features.select{|f| f.is_a?(OriginalId)}.first + # Get OriginalId features + # @return [Array] original ID features (merged datasets may have multiple original IDs) + def original_id_features + features.select{|f| f.is_a?(OriginalId)} end - # Get original id - # @param [OpenTox::Substance] substance - # @return [String] original id - def original_id substance - values(substance,original_id_feature).first + # Get OriginalSmiles features + # @return [Array] original smiles features (merged datasets may have multiple original smiles) + def original_smiles_features + features.select{|f| f.is_a?(OriginalSmiles)} end - # Get OriginalSmiles feature - # @return [OpenTox::OriginalSmiles] - def original_smiles_feature - features.select{|f| f.is_a?(OriginalSmiles)}.first + # Get Warnings features + # @return [Array] warnings features (merged datasets may have multiple warnings) + def warnings_features + features.select{|f| f.is_a?(Warnings)} end - # Get original SMILES - # @param [OpenTox::Substance] substance - # @return [String] original SMILES - def original_smiles substance - values(substance,original_smiles_feature).first - end - - def warnings_feature - features.select{|f| f.is_a?(Warnings)}.first - end - - #def warnings - #data_entries.select{|row| row[1] == warnings_feature}.collect{|row| row[2]}.compact - #end - # Get nominal and numeric bioactivity features # @return [Array] def bioactivity_features @@ -93,13 +77,13 @@ module OpenTox # Get nominal and numeric bioactivity features # @return [Array] def transformed_bioactivity_features - features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)} + features.select{|f| f._type.match(/Transformed.*BioActivity/)} end # Get nominal and numeric substance property features # @return [Array] def substance_property_features - features.select{|f| f.class.to_s.match("SubstanceProperty")} + features.select{|f| f._type.match("SubstanceProperty")} end # Writers @@ -245,7 +229,7 @@ module OpenTox compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i) + original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) numeric = [] features = [] @@ -325,31 +309,29 @@ module OpenTox # Serialisation - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names + # Convert dataset to csv format # @return [String] - def to_csv inchi=false + def to_csv #inchi=false CSV.generate() do |csv| - # TODO support multiple original id|smiles + compound = substances.first.is_a? Compound - f = features - [original_id_feature,original_smiles_feature,warnings_feature] - - if compound - csv << ["Original ID", inchi ? "InChI" : "SMILES", "Original SMILES"] + f.collect{|f| f.name} + ["Warnings"] - else - csv << ["Original ID", "Name"] + f.collect{|f| f.name} + ["Warnings"] - end + f = features - original_id_features - original_smiles_features - warnings_features + header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} + header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound + compound ? header << "Canonical SMILES" : header << "Name" + header += f.collect{|f| f.name} + header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} + csv << header substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - row = [values(substance,original_id_feature).first,name,values(substance,original_smiles_feature).first] + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name row += f.collect{|f| values(substance,f).join(" ")} - row << values(substance,warnings_feature).join(" ") + row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} csv << row end + end end @@ -373,23 +355,13 @@ module OpenTox # Dataset operations # Merge an array of datasets - # @param [Array] OpenTox::Dataset Array to be merged - # @param [Array] OpenTox::Feature Array to be merged + # @param [Array] datasets to be merged # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, features - # TODO warnings - features.uniq! + def self.merge datasets dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) datasets.each do |d| - d.substances.each do |s| - dataset.add s,d.original_id_feature,d.original_id(s) - dataset.add s,d.original_smiles_feature,d.original_smiles(s) - features.each do |f| - d.values(s,f).each do |v| - dataset.add s,features.first,v #unless dataset.values(s,f).include? v - end - end - end + dataset.data_entries += d.data_entries + dataset.warnings += d.warnings end dataset.save dataset @@ -400,6 +372,7 @@ module OpenTox def copy dataset = Dataset.new dataset.data_entries = data_entries + dataset.warnings = warnings dataset.name = name dataset.source = id.to_s dataset.save @@ -451,6 +424,16 @@ module OpenTox dataset.save dataset end + + def merge_nominal_features nominal_features, maps=[] + dataset = self.copy + new_feature = MergedNominalBioActivity.find_or_create_by(:name => nominal_features.collect{|f| f.name}.join("/") + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) + + compounds.each do |c| + if map + values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + else + end def transform # TODO end diff --git a/lib/feature.rb b/lib/feature.rb index 056957b..50dea77 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -33,6 +33,11 @@ module OpenTox class NumericBioActivity < NumericFeature end + # Merged nominal biological activity + class MergedNominalBioActivity < NominalFeature + field :original_feature_ids, type: Array + end + # Transformed nominal biological activity class TransformedNominalBioActivity < NominalFeature field :original_feature_id, type: BSON::ObjectId diff --git a/test/dataset.rb b/test/dataset.rb index 5a620dd..0beea2d 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -1,5 +1,3 @@ -# batch class - require_relative "setup.rb" class DatasetTest < MiniTest::Test @@ -123,8 +121,6 @@ class DatasetTest < MiniTest::Test csv = CSV.read f assert_equal csv.size-1, d.compounds.size assert_equal csv.first.size+1, d.features.size - # TODO fix csv output (headers, column order) - #puts d.to_csv end def test_import_epafhm @@ -197,48 +193,40 @@ class DatasetTest < MiniTest::Test mapped = d.map(d.bioactivity_features.first, map) c = d.compounds.sample assert_equal d.values(c,d.bioactivity_features.first).collect{|v| map[v]}, mapped.values(c,mapped.transformed_bioactivity_features.first) - assert_equal d.original_id(c), mapped.original_id(c) + assert_equal d.values(c,d.original_id_features.first), mapped.values(c,mapped.original_id_features.first) assert_equal d.bioactivity_features.first.name, mapped.bioactivity_features.first.name assert_equal ["carcinogen","non-carcinogen"], mapped.transformed_bioactivity_features.first.accept_values end def test_merge - skip kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} - datasets = [kazius,hansen_mapped,efsa_mapped] - d = Dataset.merge datasets, datasets.collect{|d| d.bioactivity_features}.flatten.uniq - File.open("tmp.csv","w+"){|f| f.puts d.to_csv} + #p "mapping hansen" + #hansen_mapped = hansen.map hansen.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + #p "mapping efsa" + #efsa_mapped = efsa.map efsa.bioactivity_features.first, {"1" => "mutagen", "0" => "nonmutagen"} + #datasets = [kazius,hansen_mapped,efsa_mapped] + datasets = [kazius,hansen,efsa] + d = Dataset.merge datasets#, datasets.collect{|d| d.bioactivity_features}.flatten.uniq assert_equal 8281, d.compounds.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagen"], d.values(c,d.bioactivity_features.first) - assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source - assert_equal 4, d.features.size + assert_equal datasets.collect{|d| d.id.to_s}.join(", "), d.source + assert_equal 8, d.features.size + p "serializing" + File.open("tmp.csv","w+"){|f| f.puts d.to_csv} end # serialisation def test_to_csv - # TODO - skip d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" csv = CSV.parse(d.to_csv) - original_csv = CSV.read("#{DATA_DIR}/multicolumn.csv") - header = csv.shift - original_header = original_csv.shift.collect{|h| h.strip} - #p header, original_header - original_header.each_with_index do |name,i| - name = "Original SMILES" if name == "SMILES" - j = header.index name - original_csv.each_with_index do |row,k| - row.collect!{|c| c.strip} - assert_equal csv[k][j], original_csv[k][i] - end - end - d.delete + assert_equal "3 5", csv[3][0] + assert_match "3, 5", csv[3][9] + assert_match "Duplicate", csv[3][9] + assert_equal '7,c1nccc1,[N]1C=CC=C1,1,,false,,,1.0,', csv[5].join(",") end def test_to_sdf diff --git a/test/use_cases.rb b/test/use_cases.rb new file mode 100644 index 0000000..d9ae78b --- /dev/null +++ b/test/use_cases.rb @@ -0,0 +1,50 @@ +require_relative "setup.rb" + +class UseCasesTest < MiniTest::Test + + def test_PA + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + datasets = [kazius,hansen,efsa] + training_dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: false, remove_duplicates: true + model = Model::Validation.create training_dataset: training_dataset, species: "Salmonella typhimurium", endpoint: "Mutagenicity" + pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" + prediction_dataset = model.predict pa + puts prediction_dataset.to_csv + assert_equal 8281, d.compounds.size + end + + def test_public_models + skip +=begin + #classification + aids = [ + 1205, #Rodents (multiple species/sites) + 1208, # rat carc + 1199 # mouse + # Mutagenicity + + + 1195 #MRDD + 1188 #FHM + 1208, # rat carc td50 + 1199 # mouse td50 + + # daphnia + # Blood Brain Barrier Penetration + # Lowest observed adverse effect level (LOAEL) + + # 1204 estrogen receptor + # 1259408, # GENE-TOX + # 1159563 HepG2 cytotoxicity assay + # 588209 hepatotoxicity + # 1259333 cytotoxicity + # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity + # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells + # 116724 Acute toxicity determined after intravenal administration in mice + # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days +=end + + end +end -- cgit v1.2.3