From 84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 12 Apr 2016 12:37:37 +0200 Subject: new dataset structure --- lib/compound.rb | 10 ++- lib/crossvalidation.rb | 1 - lib/dataset.rb | 173 ++++++++++++++++--------------------------------- lib/lazar.rb | 3 +- lib/model.rb | 39 +++++++++++ lib/nanoparticle.rb | 3 +- test/dataset.rb | 50 +++++++------- test/nanoparticles.rb | 9 ++- 8 files changed, 136 insertions(+), 152 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index a7518ed..84d8891 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -2,10 +2,8 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox - class Compound + class Compound < Substance require_relative "unique_descriptors.rb" - include OpenTox - DEFAULT_FINGERPRINT = "MP2D" field :inchi, type: String @@ -347,14 +345,14 @@ module OpenTox end - # Convert mg to mmol + # Convert mmol to mg # @return [Float] value in mg def mmol_to_mg mmol mmol.to_f*molecular_weight end - # Convert mmol to mg - # @return [Float] value in mg + # Convert mg to mmol + # @return [Float] value in mmol def mg_to_mmol mg mg.to_f/molecular_weight end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..b7cd7bf 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -297,5 +297,4 @@ module OpenTox end end - end diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e48626..5c04382 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -6,21 +6,25 @@ module OpenTox class Dataset # associations like has_many, belongs_to deteriorate performance - field :feature_ids, type: Array, default: [] - field :compound_ids, type: Array, default: [] - field :data_entries, type: Array, default: [] + #field :feature_ids, type: Array, default: [] + #field :substance_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers - # Get all compounds def compounds - @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id} - @compounds + substances.select{|s| s.is_a? Compound} + end + + # Get all substances + def substances + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} + @substances end # Get all features def features - @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -29,22 +33,20 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id } - col = feature_ids.index feature.id - rows.collect{|row| data_entries[row][col]} + data_entries[compound.id,feature.id] end # Writers # Set compounds def compounds=(compounds) - self.compound_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id} end # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} - end + #def features=(features) + #self.feature_ids = features.collect{|f| f.id} + #end # Dataset operations @@ -52,13 +54,8 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - unique_compound_data = {} - compound_ids.each_with_index do |cid,i| - unique_compound_data[cid] ||= [] - unique_compound_data[cid] << data_entries[i] - end - unique_compound_ids = unique_compound_data.keys - len = unique_compound_ids.size + substance_ids = data_entries.keys + len = substance_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -67,19 +64,19 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| unique_compound_ids[i]} + test_cids = test_idxs.collect{|i| substance_ids[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| unique_compound_ids[i]} - chunk = [training_cids,test_cids].collect do |unique_cids| - cids = [] - data_entries = [] - unique_cids.each do |cid| - unique_compound_data[cid].each do |de| - cids << cid - data_entries << de + training_cids = training_idxs.collect{|i| substance_ids[i]} + chunk = [training_cids,test_cids].collect do |cids| + new_cids = [] + new_data_entries = [] + cids.each do |cid| + data_entries[cid].each do |de| + new_cids << cid + new_data_entries << de end end - dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -96,27 +93,7 @@ module OpenTox # Diagnostics def duplicates feature=self.features.first - col = feature_ids.index feature.id - dups = {} - compound_ids.each_with_index do |cid,i| - rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } - values = rows.collect{|row| data_entries[row][col]} - dups[cid] = values if values.size > 1 - end - dups - end - - def correlation_plot training_dataset - # TODO: create/store svg - R.assign "features", data_entries - R.assign "activities", training_dataset.data_entries.collect{|de| de.first} - R.eval "featurePlot(features,activities)" - end - - def density_plot - # TODO: create/store svg - R.assign "acts", data_entries.collect{|r| r.first }#.compact - R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" + data_entries.select{|sid,f| f[feature.id].size > 1} end # Serialisation @@ -124,10 +101,15 @@ module OpenTox # converts dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] def to_csv(inchi=false) - CSV.generate() do |csv| #{:force_quotes=>true} + CSV.generate() do |csv| csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - compounds.each_with_index do |c,i| - csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + data_entries.each do |sid,f| + substance = Substance.find cid + features.each do |feature| + f[feature.id].each do |v| + csv << [inchi ? substance.inchi : substance.smiles , v] + end + end end end end @@ -143,7 +125,7 @@ module OpenTox # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true#, layout={} + def self.from_csv_file file, source=nil source ||= file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) @@ -153,21 +135,22 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay#, layout + dataset.parse_table table end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table, bioassay=true + def parse_table table time = Time.now # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip + # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] @@ -176,30 +159,20 @@ module OpenTox metadata = {:name => f} values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes metadata["numeric"] = true numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) else metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) end - if bioassay - if metadata["numeric"] - feature = NumericBioAssay.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalBioAssay.find_or_create_by(metadata) - end - else - metadata.merge({:measured => false, :calculated => true}) - if metadata["numeric"] - feature = NumericFeature.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalFeature.find_or_create_by(metadata) - end - end - feature_ids << feature.id if feature + @features ||= [] + @features << feature if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -210,7 +183,6 @@ module OpenTox value_time = 0 # compounds and values - self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now @@ -222,6 +194,7 @@ module OpenTox compound = OpenTox::Compound.from_smiles(identifier) when /InChI/i compound = OpenTox::Compound.from_inchi(identifier) + # TODO nanoparticle end rescue compound = nil @@ -235,13 +208,13 @@ module OpenTox compound_time += Time.now-ct r += 1 - unless vals.size == feature_ids.size # way cheaper than accessing features + unless vals.size == @features.size warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end - compound_ids << compound.id - table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) + #substance_ids << compound.id + #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) vals.each_with_index do |v,j| if v.blank? @@ -252,10 +225,13 @@ module OpenTox else v = v.strip end - self.data_entries.last[j] = v + self.data_entries[compound.id.to_s] ||= {} + self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] + self.data_entries[compound.id.to_s][@features[j].id.to_s] << v #i = compound.feature_ids.index feature_ids[j] - compound.features[feature_ids[j].to_s] ||= [] - compound.features[feature_ids[j].to_s] << v + #TODO + #compound.features[feature_ids[j].to_s] ||= [] + #compound.features[feature_ids[j].to_s] << v compound.save end end @@ -272,17 +248,6 @@ module OpenTox end - # Fill unset data entries - # @param any value - def fill_nil_with n - (0 .. compound_ids.size-1).each do |i| - data_entries[i] ||= [] - (0 .. feature_ids.size-1).each do |j| - data_entries[i][j] ||= n - end - end - end - end # Dataset for lazar predictions @@ -296,28 +261,4 @@ module OpenTox end - # Dataset for descriptors (physchem) - class DescriptorDataset < Dataset - field :feature_calculation_algorithm, type: String - - end - - class ScaledDataset < DescriptorDataset - - field :centers, type: Array, default: [] - field :scales, type: Array, default: [] - - def original_value value, i - value * scales[i] + centers[i] - end - end - - # Dataset for fminer descriptors - class FminerDataset < DescriptorDataset - field :training_algorithm, type: String - field :training_dataset_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - field :training_parameters, type: Hash - end - end diff --git a/lib/lazar.rb b/lib/lazar.rb index 0e2cec2..2bcecc5 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -61,7 +61,8 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment","Nanoparticle"]# Algorithm and Models are modules +#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", diff --git a/lib/model.rb b/lib/model.rb index 8e657b8..1f9942b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -227,6 +227,45 @@ module OpenTox end end + class NanoLazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + field :training_particle_ids, type: Array + + def self.create_all + nanoparticles = Nanoparticle.all + toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} + tox = {} + toxfeatures.each do |t| + tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} + end + tox.select!{|t,nps| nps.size > 50} + tox.collect do |t,nps| + find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) + end + end + + def predict nanoparticle + training = training_particle_ids.collect{|id| Nanoparticle.find id} + training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq + query_features = nanoparticle.physchem_descriptors.keys + common_features = (training_features & query_features) + p common_features + end + + end + end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index cda431a..c58dc8c 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -1,9 +1,8 @@ module OpenTox - class Nanoparticle + class Nanoparticle < Substance include OpenTox - #field :particle_id, type: String field :core, type: String field :coating, type: Array, default: [] diff --git a/test/dataset.rb b/test/dataset.rb index 297251e..a7b8769 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -36,38 +36,34 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class d.name = "Create dataset test" - # features not set - # << operator was removed for efficiency reasons (CH) - #assert_raises BadRequestError do - # d << [Compound.from_smiles("c1ccccc1NN"), 1,2] - #end - # add data entries - d.features = ["test1", "test2"].collect do |title| + features = ["test1", "test2"].collect do |title| f = Feature.new f.name = title f.numeric = true f.save f end - - # wrong feature size - # << operator was removed for efficiency reasons (CH) - #assert_raises BadRequestError do - # d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3] - #end # manual low-level insertions without consistency checks for runtime efficiency + compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi| + Compound.from_smiles smi + end data_entries = [] - d.compound_ids << Compound.from_smiles("c1ccccc1NN").id data_entries << [1,2] - d.compound_ids << Compound.from_smiles("CC(C)N").id data_entries << [4,5] - d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id data_entries << [6,7] - d.data_entries = data_entries + compounds.each_with_index do |c,i| + features.each_with_index do |f,j| + d.data_entries[c.id.to_s] ||= {} + d.data_entries[c.id.to_s][f.id.to_s] ||= [] + d.data_entries[c.id.to_s][f.id.to_s] << data_entries[i][j] + end + end + assert_equal 3, d.compounds.size assert_equal 2, d.features.size + p d.data_entries assert_equal [[1,2],[4,5],[6,7]], d.data_entries d.save # check if dataset has been saved correctly @@ -89,8 +85,14 @@ class DatasetTest < MiniTest::Test assert_equal "multicolumn", new_dataset.name # get features assert_equal 6, new_dataset.features.size - assert_equal 7, new_dataset.compounds.size - assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last + assert_equal 5, new_dataset.compounds.size + de = new_dataset.data_entries[new_dataset.compounds.last.id.to_s] + fid = new_dataset.features.first.id.to_s + assert_equal ["1"], de[fid] + fid = new_dataset.features.last.id.to_s + assert_equal [1.0], de[fid] + fid = new_dataset.features[2].id.to_s + assert_equal ["false"], de[fid] d.delete end @@ -117,7 +119,7 @@ class DatasetTest < MiniTest::Test assert d.warnings.grep(/Duplicate compound/) assert d.warnings.grep(/3, 5/) assert_equal 6, d.features.size - assert_equal 7, d.compounds.size + assert_equal 5, d.compounds.size assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7] @@ -195,7 +197,7 @@ class DatasetTest < MiniTest::Test assert_match "EPAFHM.mini.csv", d.source assert_equal 1, d.features.size feature = d.features.first - assert_kind_of NumericBioAssay, feature + assert_kind_of NumericFeature, feature assert_equal 0.0113, d.data_entries[0][0] assert_equal 0.00323, d.data_entries[5][0] d2 = Dataset.find d.id @@ -207,10 +209,10 @@ class DatasetTest < MiniTest::Test dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv") dataset.folds(10).each do |fold| fold.each do |d| - assert_equal d.data_entries.size, d.compound_ids.size - assert_operator d.compound_ids.size, :>=, d.compound_ids.uniq.size + assert_equal d.data_entries.size, d.compounds.size + assert_equal d.compounds.size, :>=, d.compounds.uniq.size end - assert_operator fold[0].compound_ids.uniq.size, :>=, fold[1].compound_ids.uniq.size + assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size end #puts dataset.folds 10 end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 4fc04ff..8a6836c 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -2,11 +2,16 @@ require_relative "setup.rb" class NanoparticleTest < MiniTest::Test - MODENA = File.join DATA_DIR,"MODENA-EC50_EC25.csv" - def test_import Import::Enanomapper.import assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" end + def test_create_model + Model::NanoLazar.create_all.each do |model| + np = Nanoparticle.find(model.training_particle_ids.sample) + model.predict np + end + end + end -- cgit v1.2.3