require 'csv'
require 'tempfile'

module OpenTox

  class Dataset

    field :substance_ids, type: Array, default: []
    field :feature_ids, type: Array, default: []
    field :data_entries, type: Hash, default: {}

    # Readers

    # Get all compounds
    def compounds
      substances.select{|s| s.is_a? Compound}
    end

    # Get all nanoparticles
    def nanoparticles
      substances.select{|s| s.is_a? Nanoparticle}
    end

    # Get all substances
    def substances
      @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id}
    end

    # Get all features
    def features
      @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
    end

    # Get all values for a substance/feature pair
    def values substance,feature
      substance = substance.id if substance.is_a? Substance
      feature = feature.id if feature.is_a? Feature
      if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
        data_entries[substance.to_s][feature.to_s]
      else
        nil
      end
    end

    # Writers

    # Set compounds
    def compounds=(compounds)
      self.substance_ids = compounds.collect{|c| c.id}.uniq
    end

    # Set features
    def features=(features)
      self.feature_ids = features.collect{|f| f.id}
    end

    # Add a value for a substance/feature pair
    def add(substance,feature,value)
      substance = substance.id if substance.is_a? Substance
      feature = feature.id if feature.is_a? Feature
      data_entries[substance.to_s] ||= {}
      data_entries[substance.to_s][feature.to_s] ||= []
      data_entries[substance.to_s][feature.to_s] << value
    end

    # Dataset operations

    # Split a dataset into n folds
    # @param [Integer] n number of folds
    # @return [Array] array of n folds, each fold is a pair [training_dataset, test_dataset]
    def folds n
      len = self.substance_ids.size
      indices = (0..len-1).to_a.shuffle
      mid = (len/n)
      chunks = []
      start = 0
      1.upto(n) do |i|
        last = start+mid
        last = last-1 unless len%n >= i
        test_idxs = indices[start..last] || []
        test_cids = test_idxs.collect{|idx| substance_ids[idx]}
        training_idxs = indices-test_idxs
        training_cids = training_idxs.collect{|idx| substance_ids[idx]}
        chunk = [training_cids,test_cids].collect do |cids|
          dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id)
          dataset.substances.each do |substance|
            substance.dataset_ids << dataset.id
            substance.save
            dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
          end
          dataset.save
          dataset
        end
        start = last+1
        chunks << chunk
      end
      chunks
    end

    # Serialisation

    # Convert dataset to CSV format with substance SMILES (or InChI) as the first column and feature names as the remaining column headers
    # @param [TrueClass,FalseClass] inchi use InChI instead of SMILES as compound identifier
    # @return [String]
    def to_csv(inchi=false)
      CSV.generate do |csv|
        compound = Substance.find(substance_ids.first).is_a? Compound
        if compound
          csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
        else
          csv << ["Name"] + features.collect{|f| f.name}
        end
        substances.each do |substance|
          if compound
            name = (inchi ? substance.inchi : substance.smiles)
          else
            name = substance.name
          end
          nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
          if nr_measurements.size > 1
            warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
          elsif nr_measurements.size == 1 # substances without any measurements are skipped
            (0..nr_measurements.first-1).each do |i|
              row = [name]
              features.each do |f|
                # write the i-th measurement, not the whole array of measurements
                values = data_entries[substance.id.to_s][f.id.to_s]
                values ? row << values[i] : row << ""
              end
              csv << row
            end
          end
        end
      end
    end
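
    # A minimal usage sketch (assumption: `dataset` is an already populated
    # OpenTox::Dataset and a 10-fold split is wanted; names are illustrative):
    #
    #   training_dataset, test_dataset = dataset.folds(10).first
    #   File.write("dataset.csv", dataset.to_csv)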

    # Parsers

    # Create a dataset from file (csv, sdf, ...)
    # @param filename [String]
    # @return [OpenTox::Dataset]
    # TODO
    #def self.from_sdf_file
    #end

    # Create a dataset from a CSV file
    # TODO: document structure
    def self.from_csv_file file, source=nil
      source ||= file
      name = File.basename(file,".*")
      dataset = self.find_by(:source => source, :name => name)
      if dataset
        $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
      else
        $logger.debug "Parsing #{file}."
        table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
        dataset = self.new(:source => source, :name => name)
        dataset.parse_table table
      end
      dataset
    end

    # Parse data in tabular format (e.g. from CSV);
    # does a lot of guesswork in order to determine feature types
    def parse_table table

      # features
      feature_names = table.shift.collect{|f| f.strip}
      warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.strip
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
      numeric = []

      # guess feature types
      feature_names.each_with_index do |f,i|
        metadata = {:name => f}
        values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val}.uniq.compact
        types = values.collect{|v| v.numeric? ? true : false}.uniq
        feature = nil
        if values.size == 0 # empty feature
        elsif values.size > 5 and types.size == 1 and types.first == true # more than 5 distinct numeric values
          metadata["numeric"] = true
          numeric[i] = true
          feature = NumericFeature.find_or_create_by(metadata)
        else
          metadata["nominal"] = true
          metadata["accept_values"] = values
          numeric[i] = false
          feature = NominalFeature.find_or_create_by(metadata)
        end
        feature_ids << feature.id if feature
      end

      # substances and values
      table.each_with_index do |vals,i|
        identifier = vals.shift.strip
        warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
        begin
          case compound_format
          when /SMILES/i
            substance = OpenTox::Compound.from_smiles(identifier)
          when /InChI/i
            substance = OpenTox::Compound.from_inchi(identifier)
          # TODO nanoparticle
          end
        rescue
          substance = nil
        end
        if substance.nil? # compound parsers may return nil
          warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
          next
        end
        substance_ids << substance.id
        data_entries[substance.id.to_s] = {}
        substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id
        substance.save

        unless vals.size == feature_ids.size
          warn "Number of values at position #{i+2} differs from the header size (#{vals.size} vs. #{feature_ids.size}), all entries are ignored."
          next
        end

        vals.each_with_index do |v,j|
          if v.blank?
            warn "Empty value for compound '#{identifier}' and feature '#{feature_names[j]}'."
            next
          elsif numeric[j]
            v = v.to_f
          else
            v = v.strip
          end
          data_entries[substance.id.to_s][feature_ids[j].to_s] ||= []
          data_entries[substance.id.to_s][feature_ids[j].to_s] << v
        end
      end

      # warn about duplicated structures, but keep all entries
      substances.duplicates.each do |substance|
        positions = []
        substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
        warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
      end
      substance_ids.uniq!
      feature_ids.uniq!
      save
    end
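
    # A minimal usage sketch (assumption: "training_data.csv" is a hypothetical
    # file whose first column holds SMILES and whose remaining columns hold
    # feature values):
    #
    #   dataset = OpenTox::Dataset.from_csv_file "training_data.csv"
    #   dataset.compounds.each do |compound|
    #     dataset.features.each do |feature|
    #       p dataset.values(compound, feature)
    #     end
    #   end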

  end

  # Dataset for lazar predictions
  class LazarPrediction #< Dataset

    field :creator, type: String
    field :prediction_feature_id, type: BSON::ObjectId
    field :predictions, type: Hash, default: {}

    # Get the feature that has been predicted
    def prediction_feature
      Feature.find prediction_feature_id
    end

    # Get all compounds with predictions
    def compounds
      substances.select{|s| s.is_a? Compound}
    end

    # Get all substances with predictions
    def substances
      predictions.keys.collect{|id| Substance.find id}
    end

  end

end
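
# A minimal usage sketch (assumptions: the id string is illustrative and the
# structure of the stored prediction values depends on the algorithm that
# filled the predictions hash):
#
#   prediction_dataset = OpenTox::LazarPrediction.find "5429f5e52b72ed673d000001"
#   puts prediction_dataset.prediction_feature.name
#   prediction_dataset.predictions.each do |substance_id, prediction|
#     p substance_id, prediction
#   end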