summary refs log tree commit diff
path: root/lib/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dataset.rb')
-rw-r--r-- lib/dataset.rb 35
1 files changed, 13 insertions, 22 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 9e9fdd5..1d6b56c 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -167,13 +167,15 @@ module OpenTox
# - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable)
# @param [String] category Descriptor category
# @return [OpenTox::Dataset]
- def self.from_descriptor_csv_file file, type
+ def self.from_descriptor_csv_file file, category
md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
dataset = self.find_by(:md5 => md5)
+ #dataset = nil
if dataset
$logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
else
$logger.debug "Parsing #{file}."
+ p "Parsing #{file}."
table = nil
sep = ","
["\t",";"].each do |s| # guess alternative CSV separator
@@ -189,26 +191,17 @@ module OpenTox
# features
feature_names = table.shift.collect{|f| f.strip}
raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+
original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift)
- numeric = []
- features = []
-
- feature_names.each_with_index do |f,i|
- values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
- types = values.collect{|v| v.numeric? ? true : false}.uniq
- feature = nil
- if values.size == 0 # empty feature
- elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- numeric[i] = true
- i == feature_names.size-1 ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
- else
- numeric[i] = false
- i == feature_names.size-1 ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
- end
- features << feature if feature
+ bioactivity_feature_name = feature_names.pop
+ values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact
+ types = values.collect{|v| v.numeric? ? true : false}.uniq
+ if values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+ bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name)
+ else
+ bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort)
end
- bioactivity_feature = features.pop
# substances and values
@@ -223,12 +216,10 @@ module OpenTox
if v.blank?
warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
next
- elsif numeric[j]
- v = v.to_f
else
- v = v.strip
+ property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category)
+ substance.properties[property.id.to_s] = v.to_f
end
- substance.properties[features[j].id.to_s] = [v]
end
substance.save
dataset.add substance, bioactivity_feature, bioactivity_value