From faccda14c0f98333bf7623d4caef00eea7bb1933 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 19 Aug 2019 20:53:53 +0200 Subject: temporary commit before branching - Dataset.from_descriptor_csv_file is broken --- lib/dataset.rb | 35 +++++++++++++---------------------- lib/feature.rb | 2 +- lib/model.rb | 30 +++++++++++++++++++++++++----- lib/substance.rb | 2 +- 4 files changed, 40 insertions(+), 29 deletions(-) diff --git a/lib/dataset.rb b/lib/dataset.rb index 9e9fdd5..1d6b56c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -167,13 +167,15 @@ module OpenTox # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable) # @param [String] Descriptor type # @return [OpenTox::Dataset] - def self.from_descriptor_csv_file file, type + def self.from_descriptor_csv_file file, category md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) + #dataset = nil if dataset $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." + p "Parsing #{file}." table = nil sep = "," ["\t",";"].each do |s| # guess alternative CSV separator @@ -189,26 +191,17 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift) - numeric = [] - features = [] - - feature_names.each_with_index do |f,i| - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true - i == feature_names.size-1 ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) - else - numeric[i] = false - i == feature_names.size-1 ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end - features << feature if feature + bioactivity_feature_name = feature_names.pop + values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + if values.size > 5 and types.size == 1 and types.first == true # 5 max classes + bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name) + else + bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort) end - bioactivity_feature = features.pop # substances and values @@ -223,12 +216,10 @@ module OpenTox if v.blank? warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." next - elsif numeric[j] - v = v.to_f else - v = v.strip + property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category) + substance.properties[property.id.to_s] = v.to_f end - substance.properties[features[j].id.to_s] = [v] end substance.save dataset.add substance, bioactivity_feature, bioactivity_value diff --git a/lib/feature.rb b/lib/feature.rb index 296a174..6f9d5c4 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -98,10 +98,10 @@ module OpenTox end class NumericSubstanceProperty < NumericFeature + field :category, type: String end class NanoParticleProperty < NumericSubstanceProperty - field :category, type: String field :conditions, type: Hash end diff --git a/lib/model.rb b/lib/model.rb index d7b2df6..07759c5 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -109,8 +109,25 @@ module OpenTox :method => "Algorithm::FeatureSelection.correlation_filter", }, } + elsif substance_classes.first == "OpenTox::Substance" and algorithms[:descriptors][:method] == "properties" and algorithms[:descriptors][:categories] + model.algorithms = { + :feature_selection => nil, + :similarity => { # similarity algorithm + :method => "Algorithm::Similarity.weighted_cosine", + :min => [0.5,0.2] + }, + } + if model.class == LazarClassification + model.algorithms[:prediction] = { + :method => "Algorithm::Classification.weighted_majority_vote", + } + elsif model.class == LazarRegression + model.algorithms[:prediction] = { + :method => "Algorithm::Caret.rf", + } + end else - raise ArgumentError, "Cannot create models for #{substance_classes.first}." + raise ArgumentError, "Cannot create models for #{substance_classes.first} #{algorithms.to_json}." end # overwrite defaults with explicit parameters @@ -165,14 +182,17 @@ module OpenTox end # parse independent_variables when "properties" - categories = model.algorithms[:descriptors][:categories] feature_ids = [] - categories.each do |category| + model.algorithms[:descriptors][:categories].each do |category| + p category Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - properties = model.substances.collect { |s| s.properties } - property_ids = properties.collect{|p| p.keys}.flatten.uniq + p feature_ids + property_ids = model.substances.collect { |s| s.properties.keys }.flatten.uniq + p property_ids model.descriptor_ids = feature_ids & property_ids + p model.descriptor_ids + exit model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} else raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." diff --git a/lib/substance.rb b/lib/substance.rb index 5c486d8..3d95586 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,6 +1,6 @@ module OpenTox - # Base class for substances (e.g. compunds, nanoparticles) + # Base class for substances (e.g. compounds, nanoparticles) class Substance field :properties, type: Hash, default: {} end -- cgit v1.2.3