summary | refs | log | tree | commit | diff
path: root/lib/dataset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dataset.rb')
-rw-r--r-- lib/dataset.rb 108
1 files changed, 3 insertions, 105 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index b6c6173..bbb20be 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -384,6 +384,9 @@ module OpenTox
end
chunks
end
+
+ def transform # TODO: not implemented yet; the body is empty, so calling it returns nil
+ end
# Delete dataset
def delete
@@ -419,109 +422,4 @@ module OpenTox
end
- class Batch
-
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "batch"
- field :name, type: String
- field :source, type: String
- field :identifiers, type: Array
- field :ids, type: Array
- field :compounds, type: Array
- field :warnings, type: Array, default: []
-
- def self.from_csv_file file
- source = file
- name = File.basename(file,".*")
- batch = self.find_by(:source => source, :name => name)
- if batch
- $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
- else
- $logger.debug "Parsing #{file}."
- # check delimiter
- line = File.readlines(file).first
- if line.match(/\t/)
- table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8'
- else
- table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
- end
- batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => [])
-
- # original IDs
- if table[0][0] =~ /ID/i
- @original_ids = table.collect{|row| row.shift}
- @original_ids.shift
- end
-
- # features
- feature_names = table.shift.collect{|f| f.strip}
- warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
- compound_format = feature_names.shift.strip
- unless compound_format =~ /SMILES|InChI/i
- File.delete file
- bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \
- "Accepted formats: SMILES, InChI. Please take a look on the help page."
- end
- #numeric = []
- features = []
- # guess feature types
- feature_names.each_with_index do |f,i|
- metadata = {:name => f}
- values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
- types = values.collect{|v| v.numeric? ? true : false}.uniq
- feature = nil
- if values.size == 0 # empty feature
- elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- #numeric[i] = true
- feature = NumericFeature.find_or_create_by(metadata)
- else
- metadata["accept_values"] = values.sort
- #numeric[i] = false
- feature = NominalFeature.find_or_create_by(metadata)
- end
- features << feature if feature
- end
-
- table.each_with_index do |vals,i|
- identifier = vals.shift.strip.gsub(/^'|'$/,"")
- begin
- case compound_format
- when /SMILES/i
- compound = OpenTox::Compound.from_smiles(identifier)
- when /InChI/i
- compound = OpenTox::Compound.from_inchi(identifier)
- end
- rescue
- compound = nil
- end
- # collect only for present compounds
- unless compound.nil?
- batch.identifiers << identifier
- batch.compounds << compound.id
- batch.ids << @original_ids[i] if @original_ids
- else
- batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}."
- end
- end
- batch.compounds.duplicates.each do |duplicate|
- $logger.debug "Duplicates found in #{name}."
- dup = Compound.find duplicate
- positions = []
- batch.compounds.each_with_index do |co,i|
- c = Compound.find co
- if !c.blank? and c.inchi and c.inchi == dup.inchi
- positions << i+1
- end
- end
- batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}."
- end
- batch.save
- end
- batch
- end
-
- end
-
end