summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch>, 2019-08-19 20:53:53 +0200
committer: Christoph Helma <helma@in-silico.ch>, 2019-08-19 20:53:53 +0200
commit: faccda14c0f98333bf7623d4caef00eea7bb1933 (patch)
tree: 0be6178e5e2fbef81734d4cd6809c6d09cf16aae
parent: ede38a2f1390befe8f7cf8a62fb5432448633d63 (diff)
temporary commit before branching - Dataset.from_descriptor_csv_file is broken
-rw-r--r-- lib/dataset.rb 35
-rw-r--r-- lib/feature.rb 2
-rw-r--r-- lib/model.rb 30
-rw-r--r-- lib/substance.rb 2
4 files changed, 40 insertions, 29 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 9e9fdd5..1d6b56c 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -167,13 +167,15 @@ module OpenTox
# - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
# @param [String] Descriptor type
# @return [OpenTox::Dataset]
- def self.from_descriptor_csv_file file, type
+ def self.from_descriptor_csv_file file, category
md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
dataset = self.find_by(:md5 => md5)
+ #dataset = nil
if dataset
$logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
else
$logger.debug "Parsing #{file}."
+ p "Parsing #{file}."
table = nil
sep = ","
["\t",";"].each do |s| # guess alternative CSV separator
@@ -189,26 +191,17 @@ module OpenTox
# features
feature_names = table.shift.collect{|f| f.strip}
raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+
original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift)
- numeric = []
- features = []
-
- feature_names.each_with_index do |f,i|
- values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
- types = values.collect{|v| v.numeric? ? true : false}.uniq
- feature = nil
- if values.size == 0 # empty feature
- elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- numeric[i] = true
- i == feature_names.size-1 ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
- else
- numeric[i] = false
- i == feature_names.size-1 ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
- end
- features << feature if feature
+ bioactivity_feature_name = feature_names.pop
+ values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact
+ types = values.collect{|v| v.numeric? ? true : false}.uniq
+ if values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+ bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name)
+ else
+ bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort)
end
- bioactivity_feature = features.pop
# substances and values
@@ -223,12 +216,10 @@ module OpenTox
if v.blank?
warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
next
- elsif numeric[j]
- v = v.to_f
else
- v = v.strip
+ property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category)
+ substance.properties[property.id.to_s] = v.to_f
end
- substance.properties[features[j].id.to_s] = [v]
end
substance.save
dataset.add substance, bioactivity_feature, bioactivity_value
diff --git a/lib/feature.rb b/lib/feature.rb
index 296a174..6f9d5c4 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -98,10 +98,10 @@ module OpenTox
end
class NumericSubstanceProperty < NumericFeature
+ field :category, type: String
end
class NanoParticleProperty < NumericSubstanceProperty
- field :category, type: String
field :conditions, type: Hash
end
diff --git a/lib/model.rb b/lib/model.rb
index d7b2df6..07759c5 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -109,8 +109,25 @@ module OpenTox
:method => "Algorithm::FeatureSelection.correlation_filter",
},
}
+ elsif substance_classes.first == "OpenTox::Substance" and algorithms[:descriptors][:method] == "properties" and algorithms[:descriptors][:categories]
+ model.algorithms = {
+ :feature_selection => nil,
+ :similarity => { # similarity algorithm
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => [0.5,0.2]
+ },
+ }
+ if model.class == LazarClassification
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Classification.weighted_majority_vote",
+ }
+ elsif model.class == LazarRegression
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Caret.rf",
+ }
+ end
else
- raise ArgumentError, "Cannot create models for #{substance_classes.first}."
+ raise ArgumentError, "Cannot create models for #{substance_classes.first} #{algorithms.to_json}."
end
# overwrite defaults with explicit parameters
@@ -165,14 +182,17 @@ module OpenTox
end
# parse independent_variables
when "properties"
- categories = model.algorithms[:descriptors][:categories]
feature_ids = []
- categories.each do |category|
+ model.algorithms[:descriptors][:categories].each do |category|
+ p category
Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
end
- properties = model.substances.collect { |s| s.properties }
- property_ids = properties.collect{|p| p.keys}.flatten.uniq
+ p feature_ids
+ property_ids = model.substances.collect { |s| s.properties.keys }.flatten.uniq
+ p property_ids
model.descriptor_ids = feature_ids & property_ids
+ p model.descriptor_ids
+ exit
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
else
raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
diff --git a/lib/substance.rb b/lib/substance.rb
index 5c486d8..3d95586 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,6 +1,6 @@
module OpenTox
- # Base class for substances (e.g. compunds, nanoparticles)
+ # Base class for substances (e.g. compounds, nanoparticles)
class Substance
field :properties, type: Hash, default: {}
end