From 063acd4dc63e9287287cc1ff78fff2064ff74e4f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 7 Apr 2016 17:39:14 +0200 Subject: initial ambit import --- lib/dataset.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 5d8aeaf..2e48626 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -9,7 +9,6 @@ module OpenTox field :feature_ids, type: Array, default: [] field :compound_ids, type: Array, default: [] field :data_entries, type: Array, default: [] - field :source, type: String # Readers -- cgit v1.2.3 From 84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 12 Apr 2016 12:37:37 +0200 Subject: new dataset structure --- lib/dataset.rb | 173 +++++++++++++++++++-------------------------------------- 1 file changed, 57 insertions(+), 116 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e48626..5c04382 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -6,21 +6,25 @@ module OpenTox class Dataset # associations like has_many, belongs_to deteriorate performance - field :feature_ids, type: Array, default: [] - field :compound_ids, type: Array, default: [] - field :data_entries, type: Array, default: [] + #field :feature_ids, type: Array, default: [] + #field :substance_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers - # Get all compounds def compounds - @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id} - @compounds + substances.select{|s| s.is_a? Compound} + end + + # Get all substances + def substances + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} + @substances end # Get all features def features - @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -29,22 +33,20 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id } - col = feature_ids.index feature.id - rows.collect{|row| data_entries[row][col]} + data_entries[compound.id,feature.id] end # Writers # Set compounds def compounds=(compounds) - self.compound_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id} end # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} - end + #def features=(features) + #self.feature_ids = features.collect{|f| f.id} + #end # Dataset operations @@ -52,13 +54,8 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - unique_compound_data = {} - compound_ids.each_with_index do |cid,i| - unique_compound_data[cid] ||= [] - unique_compound_data[cid] << data_entries[i] - end - unique_compound_ids = unique_compound_data.keys - len = unique_compound_ids.size + substance_ids = data_entries.keys + len = substance_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -67,19 +64,19 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| unique_compound_ids[i]} + test_cids = test_idxs.collect{|i| substance_ids[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| unique_compound_ids[i]} - chunk = [training_cids,test_cids].collect do |unique_cids| - cids = [] - data_entries = [] - unique_cids.each do |cid| - unique_compound_data[cid].each do |de| - cids << cid - data_entries << de + training_cids = training_idxs.collect{|i| substance_ids[i]} + chunk = [training_cids,test_cids].collect do |cids| + new_cids = [] + new_data_entries = [] + cids.each do |cid| + data_entries[cid].each do |de| + new_cids << cid + new_data_entries << de end end - dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -96,27 +93,7 @@ module OpenTox # Diagnostics def duplicates feature=self.features.first - col = feature_ids.index feature.id - dups = {} - compound_ids.each_with_index do |cid,i| - rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } - values = rows.collect{|row| data_entries[row][col]} - dups[cid] = values if values.size > 1 - end - dups - end - - def correlation_plot training_dataset - # TODO: create/store svg - R.assign "features", data_entries - R.assign "activities", training_dataset.data_entries.collect{|de| de.first} - R.eval "featurePlot(features,activities)" - end - - def density_plot - # TODO: create/store svg - R.assign "acts", data_entries.collect{|r| r.first }#.compact - R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" + data_entries.select{|sid,f| f[feature.id].size > 1} end # Serialisation @@ -124,10 +101,15 @@ module OpenTox # converts dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] def to_csv(inchi=false) - CSV.generate() do |csv| #{:force_quotes=>true} + CSV.generate() do |csv| csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - compounds.each_with_index do |c,i| - csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + data_entries.each do |sid,f| + substance = Substance.find cid + features.each do |feature| + f[feature.id].each do |v| + csv << [inchi ? substance.inchi : substance.smiles , v] + end + end end end end @@ -143,7 +125,7 @@ module OpenTox # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true#, layout={} + def self.from_csv_file file, source=nil source ||= file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) @@ -153,21 +135,22 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay#, layout + dataset.parse_table table end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table, bioassay=true + def parse_table table time = Time.now # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip + # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] @@ -176,30 +159,20 @@ module OpenTox metadata = {:name => f} values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes metadata["numeric"] = true numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) else metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) end - if bioassay - if metadata["numeric"] - feature = NumericBioAssay.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalBioAssay.find_or_create_by(metadata) - end - else - metadata.merge({:measured => false, :calculated => true}) - if metadata["numeric"] - feature = NumericFeature.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalFeature.find_or_create_by(metadata) - end - end - feature_ids << feature.id if feature + @features ||= [] + @features << feature if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -210,7 +183,6 @@ module OpenTox value_time = 0 # compounds and values - self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now @@ -222,6 +194,7 @@ module OpenTox compound = OpenTox::Compound.from_smiles(identifier) when /InChI/i compound = OpenTox::Compound.from_inchi(identifier) + # TODO nanoparticle end rescue compound = nil @@ -235,13 +208,13 @@ module OpenTox compound_time += Time.now-ct r += 1 - unless vals.size == feature_ids.size # way cheaper than accessing features + unless vals.size == @features.size warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end - compound_ids << compound.id - table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) + #substance_ids << compound.id + #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) vals.each_with_index do |v,j| if v.blank? @@ -252,10 +225,13 @@ module OpenTox else v = v.strip end - self.data_entries.last[j] = v + self.data_entries[compound.id.to_s] ||= {} + self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] + self.data_entries[compound.id.to_s][@features[j].id.to_s] << v #i = compound.feature_ids.index feature_ids[j] - compound.features[feature_ids[j].to_s] ||= [] - compound.features[feature_ids[j].to_s] << v + #TODO + #compound.features[feature_ids[j].to_s] ||= [] + #compound.features[feature_ids[j].to_s] << v compound.save end end @@ -272,17 +248,6 @@ module OpenTox end - # Fill unset data entries - # @param any value - def fill_nil_with n - (0 .. compound_ids.size-1).each do |i| - data_entries[i] ||= [] - (0 .. feature_ids.size-1).each do |j| - data_entries[i][j] ||= n - end - end - end - end # Dataset for lazar predictions @@ -296,28 +261,4 @@ module OpenTox end - # Dataset for descriptors (physchem) - class DescriptorDataset < Dataset - field :feature_calculation_algorithm, type: String - - end - - class ScaledDataset < DescriptorDataset - - field :centers, type: Array, default: [] - field :scales, type: Array, default: [] - - def original_value value, i - value * scales[i] + centers[i] - end - end - - # Dataset for fminer descriptors - class FminerDataset < DescriptorDataset - field :training_algorithm, type: String - field :training_dataset_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - field :training_parameters, type: Hash - end - end -- cgit v1.2.3 From a8368dda776c05331474adf7eaf9a6e413a3b1eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:15:51 +0200 Subject: validation tests pass --- lib/dataset.rb | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 5c04382..25307c9 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,9 +5,6 @@ module OpenTox class Dataset - # associations like has_many, belongs_to deteriorate performance - #field :feature_ids, type: Array, default: [] - #field :substance_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -24,7 +21,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -33,7 +30,7 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - data_entries[compound.id,feature.id] + data_entries[compound.id.to_s][feature.id.to_s] end # Writers @@ -68,15 +65,14 @@ module OpenTox training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| - new_cids = [] - new_data_entries = [] + new_data_entries = {} cids.each do |cid| - data_entries[cid].each do |de| - new_cids << cid - new_data_entries << de + data_entries[cid].each do |f,v| + new_data_entries[cid] ||= {} + new_data_entries[cid][f] = v end end - dataset = self.class.new(:data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => new_data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -213,9 +209,6 @@ module OpenTox next end - #substance_ids << compound.id - #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) - vals.each_with_index do |v,j| if v.blank? warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." @@ -228,10 +221,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - #i = compound.feature_ids.index feature_ids[j] - #TODO - #compound.features[feature_ids[j].to_s] ||= [] - #compound.features[feature_ids[j].to_s] << v + compound.features[@features[j].id.to_s] ||= [] + compound.features[@features[j].id.to_s] << v compound.save end end @@ -251,14 +242,23 @@ module OpenTox end # Dataset for lazar predictions - class LazarPrediction < Dataset + class LazarPrediction #< Dataset field :creator, type: String - field :prediction_feature_id, type: String + field :prediction_feature_id, type: BSON::ObjectId + field :predictions, type: Hash, default: {} def prediction_feature Feature.find prediction_feature_id end + def compounds + substances.select{|s| s.is_a? Compound} + end + + def substances + predictions.keys.collect{|id| Substance.find id} + end + end end -- cgit v1.2.3 From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 14 Apr 2016 19:43:24 +0200 Subject: features/toxicities fixed --- lib/dataset.rb | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 25307c9..274c475 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -13,6 +13,10 @@ module OpenTox substances.select{|s| s.is_a? Compound} end + def nanoparticles + substances.select{|s| s.is_a? Nanoparticle} + end + # Get all substances def substances @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} @@ -21,7 +25,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact @features end @@ -98,13 +102,22 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + compound = Substance.find(data_entries.first.first).is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end data_entries.each do |sid,f| - substance = Substance.find cid + substance = Substance.find sid features.each do |feature| - f[feature.id].each do |v| - csv << [inchi ? substance.inchi : substance.smiles , v] - end + f[feature.id.to_s].each do |v| + if compound + csv << [inchi ? substance.inchi : substance.smiles , v] + else + csv << [substance.name , v] + end + end if f[feature.id.to_s] end end end @@ -221,8 +234,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - compound.features[@features[j].id.to_s] ||= [] - compound.features[@features[j].id.to_s] << v + compound.toxicities[@features[j].id.to_s] ||= [] + compound.toxicities[@features[j].id.to_s] << v compound.save end end -- cgit v1.2.3 From 8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 11:01:16 +0200 Subject: data_entries removed from datasets. datasets are now just containers for compounds and features, feature values have to be retrieved from substances. --- lib/dataset.rb | 65 +++++++++++++++++++++------------------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 274c475..fdf1bfc 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,7 +5,8 @@ module OpenTox class Dataset - field :data_entries, type: Hash, default: {} + field :substance_ids, type: Array, default: [] + field :feature_ids, type: Array, default: [] # Readers @@ -19,13 +20,13 @@ module OpenTox # Get all substances def substances - @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} + @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id} @substances end # Get all features def features - @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact + @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} @features end @@ -33,9 +34,9 @@ module OpenTox # @param compound [OpenTox::Compound] OpenTox Compound object # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values - def values(compound, feature) - data_entries[compound.id.to_s][feature.id.to_s] - end + #def values(compound, feature) + #data_entries[compound.id.to_s][feature.id.to_s] + #end # Writers @@ -45,9 +46,9 @@ module OpenTox end # Set features - #def features=(features) - #self.feature_ids = features.collect{|f| f.id} - #end + def features=(features) + self.feature_ids = features.collect{|f| f.id} + end # Dataset operations @@ -55,8 +56,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - substance_ids = data_entries.keys - len = substance_ids.size + len = self.substance_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -69,19 +69,11 @@ module OpenTox training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| - new_data_entries = {} - cids.each do |cid| - data_entries[cid].each do |f,v| - new_data_entries[cid] ||= {} - new_data_entries[cid][f] = v - end - end - dataset = self.class.new(:data_entries => new_data_entries, :source => self.id ) + dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save end - dataset.save dataset end start = last+1 @@ -90,12 +82,6 @@ module OpenTox chunks end - # Diagnostics - - def duplicates feature=self.features.first - data_entries.select{|sid,f| f[feature.id].size > 1} - end - # Serialisation # converts dataset to csv format including compound smiles as first column, other column headers are feature names @@ -161,7 +147,6 @@ module OpenTox compound_format = feature_names.shift.strip # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - numeric = [] # guess feature types feature_names.each_with_index do |f,i| @@ -180,8 +165,7 @@ module OpenTox numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end - @features ||= [] - @features << feature if feature + feature_ids << feature.id if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -196,7 +180,7 @@ module OpenTox table.each_with_index do |vals,i| ct = Time.now identifier = vals.shift.strip - warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? + warn "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format when /SMILES/i @@ -208,41 +192,38 @@ module OpenTox rescue compound = nil end - if compound.nil? - # compound parsers may return nil - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + if compound.nil? # compound parsers may return nil + warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end + substance_ids << compound.id compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id compound_time += Time.now-ct r += 1 - unless vals.size == @features.size - warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." + unless vals.size == feature_ids.size + warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end vals.each_with_index do |v,j| if v.blank? - warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] v = v.to_f else v = v.strip end - self.data_entries[compound.id.to_s] ||= {} - self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] - self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - compound.toxicities[@features[j].id.to_s] ||= [] - compound.toxicities[@features[j].id.to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= [] + compound.toxicities[feature_ids[j].to_s] << v compound.save end end compounds.duplicates.each do |compound| positions = [] compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" -- cgit v1.2.3 From 4662e845c12e3e623ec9bec208c42cd4b1886047 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 14:58:17 +0200 Subject: enm study import --- lib/dataset.rb | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index fdf1bfc..b51d74b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -88,22 +88,21 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - compound = Substance.find(data_entries.first.first).is_a? Compound + compound = Substance.find(substance_ids.first).is_a? Compound if compound csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} else csv << ["Name"] + features.collect{|f| f.name} end - data_entries.each do |sid,f| - substance = Substance.find sid - features.each do |feature| - f[feature.id.to_s].each do |v| + substances.each do |substance| + features.each do |f| + substance.toxicities[f.id.to_s].each do |v| if compound csv << [inchi ? substance.inchi : substance.smiles , v] else csv << [substance.name , v] end - end if f[feature.id.to_s] + end if substance.toxicities[f.id.to_s] end end end -- cgit v1.2.3 From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- lib/dataset.rb | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b51d74b..9b24440 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -30,19 +30,11 @@ module OpenTox @features end - # Find data entry values for a given compound and feature - # @param compound [OpenTox::Compound] OpenTox Compound object - # @param feature [OpenTox::Feature] OpenTox Feature object - # @return [Array] Data entry values - #def values(compound, feature) - #data_entries[compound.id.to_s][feature.id.to_s] - #end - # Writers # Set compounds def compounds=(compounds) - self.substance_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id}.uniq end # Set features @@ -95,14 +87,27 @@ module OpenTox csv << ["Name"] + features.collect{|f| f.name} end substances.each do |substance| - features.each do |f| - substance.toxicities[f.id.to_s].each do |v| - if compound - csv << [inchi ? substance.inchi : substance.smiles , v] - else - csv << [substance.name , v] + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + if substance.toxicities[f.id.to_s] + row << substance.toxicities[f.id.to_s][i] + else + row << "" + end end - end if substance.toxicities[f.id.to_s] + csv << row + end end end end @@ -224,6 +229,8 @@ module OpenTox compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end + substance_ids.uniq! + feature_ids.uniq! $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" time = Time.now -- cgit v1.2.3 From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/dataset.rb | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 9b24440..86800c6 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -64,6 +64,9 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id + compound.toxicities.each do |feature_id,data| + data[dataset.id.to_s] = data[self.id.to_s] # copy data entries + end compound.save end dataset @@ -92,7 +95,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -100,8 +103,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] - row << substance.toxicities[f.id.to_s][i] + if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] + row << substance.toxicities[f.id.to_s][self.id.to_s][i] else row << "" end @@ -149,7 +152,6 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip - # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] # guess feature types @@ -219,8 +221,9 @@ module OpenTox else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= [] - compound.toxicities[feature_ids[j].to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= {} + compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] + compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v compound.save end end -- cgit v1.2.3 From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:11:46 +0200 Subject: nanoparticle models fixed --- lib/dataset.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 86800c6..9738c1f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -62,12 +62,12 @@ module OpenTox training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - dataset.compounds.each do |compound| - compound.dataset_ids << dataset.id - compound.toxicities.each do |feature_id,data| + dataset.substances.each do |substance| + substance.dataset_ids << dataset.id + substance.toxicities.each do |feature_id,data| data[dataset.id.to_s] = data[self.id.to_s] # copy data entries end - compound.save + substance.save end dataset end -- cgit v1.2.3 From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/dataset.rb | 77 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 38 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 9738c1f..8c7fe68 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -7,6 +7,7 @@ module OpenTox field :substance_ids, type: Array, default: [] field :feature_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers @@ -30,6 +31,16 @@ module OpenTox @features end + def values substance,feature + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] + data_entries[substance.to_s][feature.to_s] + else + nil + end + end + # Writers # Set compounds @@ -42,6 +53,14 @@ module OpenTox self.feature_ids = features.collect{|f| f.id} end + def add(substance,feature,value) + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + data_entries[substance.to_s] ||= {} + data_entries[substance.to_s][feature.to_s] ||= [] + data_entries[substance.to_s][feature.to_s] << value + end + # Dataset operations # Split a dataset into n folds @@ -64,11 +83,10 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.substances.each do |substance| substance.dataset_ids << dataset.id - substance.toxicities.each do |feature_id,data| - data[dataset.id.to_s] = data[self.id.to_s] # copy data entries - end substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end + dataset.save dataset end start = last+1 @@ -95,7 +113,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -103,8 +121,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] - row << substance.toxicities[f.id.to_s][self.id.to_s][i] + if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s] + row << data_entries[substance.id.to_s][f.id.to_s] else row << "" end @@ -146,8 +164,6 @@ module OpenTox # does a lot of guesswork in order to determine feature types def parse_table table - time = Time.now - # features feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size @@ -174,39 +190,31 @@ module OpenTox feature_ids << feature.id if feature end - $logger.debug "Feature values: #{Time.now-time}" - time = Time.now - - r = -1 - compound_time = 0 - value_time = 0 - - # compounds and values + # substances and values table.each_with_index do |vals,i| - ct = Time.now identifier = vals.shift.strip warn "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) + substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) + substance = OpenTox::Compound.from_inchi(identifier) # TODO nanoparticle end rescue - compound = nil + substance = nil end - if compound.nil? # compound parsers may return nil + if substance.nil? # compound parsers may return nil warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - substance_ids << compound.id - compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id - compound_time += Time.now-ct + substance_ids << substance.id + data_entries[substance.id.to_s] = {} + substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.save - r += 1 unless vals.size == feature_ids.size warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next @@ -214,32 +222,25 @@ module OpenTox vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." next elsif numeric[j] v = v.to_f else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= {} - compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] - compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v - compound.save + data_entries[substance.id.to_s][feature_ids[j].to_s] ||= [] + data_entries[substance.id.to_s][feature_ids[j].to_s] << v end end - compounds.duplicates.each do |compound| + substances.duplicates.each do |substance| positions = [] - compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end substance_ids.uniq! feature_ids.uniq! - - $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" - time = Time.now save - $logger.debug "Saving: #{Time.now-time}" - end end -- cgit v1.2.3 From c90644211e214a50f6fdb3a936bf247f45f1f4be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 13 May 2016 13:38:24 +0200 Subject: compound tests fixed --- lib/dataset.rb | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 8c7fe68..205f640 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,8 +5,8 @@ module OpenTox class Dataset - field :substance_ids, type: Array, default: [] - field :feature_ids, type: Array, default: [] + #field :substance_ids, type: Array, default: [] + #field :feature_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -21,13 +21,14 @@ module OpenTox # Get all substances def substances - @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id} + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq @substances end # Get all features def features - @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} + #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end @@ -58,7 +59,11 @@ module OpenTox feature = feature.id if feature.is_a? Feature data_entries[substance.to_s] ||= {} data_entries[substance.to_s][feature.to_s] ||= [] - data_entries[substance.to_s][feature.to_s] << value + if value.is_a? Array + data_entries[substance.to_s][feature.to_s] += value + else + data_entries[substance.to_s][feature.to_s] << value + end end # Dataset operations @@ -67,7 +72,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - len = self.substance_ids.size + len = self.substances.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -76,12 +81,14 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| substance_ids[i]} + test_substances = test_idxs.collect{|i| substances[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| substance_ids[i]} - chunk = [training_cids,test_cids].collect do |cids| - dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - dataset.substances.each do |substance| + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:source => self.id ) + substances.each do |substance| + #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) + #dataset.substances.each do |substance| substance.dataset_ids << dataset.id substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} @@ -170,6 +177,7 @@ module OpenTox compound_format = feature_names.shift.strip bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] + features = [] # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} @@ -187,7 +195,7 @@ module OpenTox numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end - feature_ids << feature.id if feature + features << feature if feature end # substances and values @@ -210,12 +218,10 @@ module OpenTox warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - substance_ids << substance.id - data_entries[substance.id.to_s] = {} substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id substance.save - unless vals.size == feature_ids.size + unless vals.size == features.size warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end @@ -229,8 +235,7 @@ module OpenTox else v = v.strip end - data_entries[substance.id.to_s][feature_ids[j].to_s] ||= [] - data_entries[substance.id.to_s][feature_ids[j].to_s] << v + add substance, features[j], v end end substances.duplicates.each do |substance| @@ -238,8 +243,6 @@ module OpenTox substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end - substance_ids.uniq! - feature_ids.uniq! save end -- cgit v1.2.3 From b2d80ad2e470fcb41af4b747142e5693f2fa4615 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 13:05:53 +0200 Subject: dataset tests fixed --- lib/dataset.rb | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 205f640..38a55a8 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,8 +5,6 @@ module OpenTox class Dataset - #field :substance_ids, type: Array, default: [] - #field :feature_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -27,7 +25,6 @@ module OpenTox # Get all features def features - #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end @@ -44,16 +41,6 @@ module OpenTox # Writers - # Set compounds - def compounds=(compounds) - self.substance_ids = compounds.collect{|c| c.id}.uniq - end - - # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} - end - def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature @@ -87,8 +74,6 @@ module OpenTox chunk = [training_substances,test_substances].collect do |substances| dataset = self.class.create(:source => self.id ) substances.each do |substance| - #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - #dataset.substances.each do |substance| substance.dataset_ids << dataset.id substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} @@ -108,7 +93,7 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - compound = Substance.find(substance_ids.first).is_a? Compound + compound = substances.first.is_a? Compound if compound csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} else @@ -128,11 +113,7 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s] - row << data_entries[substance.id.to_s][f.id.to_s] - else - row << "" - end + values(substance,f) ? row << values(substance,f)[i] : row << "" end csv << row end @@ -152,8 +133,8 @@ module OpenTox # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil - source ||= file + def self.from_csv_file file, accept_empty_values=false + source = file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset @@ -162,14 +143,14 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table + dataset.parse_table table, accept_empty_values end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table + def parse_table table, accept_empty_values # features feature_names = table.shift.collect{|f| f.strip} @@ -200,24 +181,25 @@ module OpenTox # substances and values + all_substances = [] table.each_with_index do |vals,i| identifier = vals.shift.strip - warn "No feature values for compound at position #{i+2}." if vals.compact.empty? + warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i substance = OpenTox::Compound.from_inchi(identifier) - # TODO nanoparticle end rescue substance = nil end if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id substance.save @@ -237,10 +219,11 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end - substances.duplicates.each do |substance| + all_substances.duplicates.each do |substance| positions = [] - substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end save -- cgit v1.2.3 From cc08e6beda7f7d70ebf6c6929a22d1a0cd7c1a20 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 15:41:24 +0200 Subject: tests fixed. DescriptorTest#test_compound_all may fail within all.rb --- lib/dataset.rb | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 38a55a8..9138452 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -229,6 +229,11 @@ module OpenTox save end + def delete + compounds.each{|c| c.dataset_ids.delete id.to_s} + super + end + end # Dataset for lazar predictions -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/dataset.rb | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 9138452..0c65d61 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,11 +46,8 @@ module OpenTox feature = feature.id if feature.is_a? Feature data_entries[substance.to_s] ||= {} data_entries[substance.to_s][feature.to_s] ||= [] - if value.is_a? Array - data_entries[substance.to_s][feature.to_s] += value - else - data_entries[substance.to_s][feature.to_s] << value - end + data_entries[substance.to_s][feature.to_s] << value + #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end # Dataset operations @@ -75,6 +72,7 @@ module OpenTox dataset = self.class.create(:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end @@ -200,7 +198,8 @@ module OpenTox next end all_substances << substance - substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.dataset_ids << self.id + substance.dataset_ids.uniq! substance.save unless vals.size == features.size -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/dataset.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 0c65d61..2e21e5b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -69,7 +69,7 @@ module OpenTox training_idxs = indices-test_idxs training_substances = training_idxs.collect{|i| substances[i]} chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:source => self.id ) + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id substance.dataset_ids.uniq! -- cgit v1.2.3 From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/dataset.rb | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e21e5b..453fc35 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -165,11 +165,9 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - metadata["numeric"] = true numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) -- cgit v1.2.3 From fbded88db8b51f41ffbd5a02f601e4538ec87258 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 14 Oct 2016 09:55:51 +0200 Subject: git commit added to model metadata --- lib/dataset.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 453fc35..ab55294 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -130,7 +130,6 @@ module OpenTox #end # Create a dataset from CSV file - # TODO: document structure def self.from_csv_file file, accept_empty_values=false source = file name = File.basename(file,".*") -- cgit v1.2.3