require 'csv'
require 'json'
require 'tempfile'

module OpenTox

  class Dataset
    include Mongoid::Document

    attr_accessor :bulk

    # associations like has_many, belongs_to deteriorate performance
    field :feature_ids, type: Array, default: []
    field :compound_ids, type: Array, default: []
    field :source, type: String
    field :warnings, type: Array, default: []

    def initialize params=nil
      super params
      @bulk = []
    end

    # Readers

    # @return [Array<OpenTox::Compound>] Compounds of the dataset
    def compounds
      self.compound_ids.collect{|id| OpenTox::Compound.find id}
    end

    # @return [Array<OpenTox::Feature>] Features of the dataset
    def features
      self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
    end

    # Get the first value for a compound/feature pair
    def [](compound,feature)
      bad_request_error "Incorrect parameter type. The first argument must be an OpenTox::Compound, the second an OpenTox::Feature." unless compound.is_a? Compound and feature.is_a? Feature
      DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value).first
    end

    # Get all feature values for a compound
    def fingerprint(compound)
      data_entries[compound.id]
    end

    # Lazily load all data entries into a Hash
    # (keys: compound ids, values: arrays ordered like feature_ids)
    def data_entries
      unless @data_entries
        entries = {}
        t = Time.now
        DataEntry.where(dataset_id: self.id).each do |de|
          entries[de.compound_id] ||= {}
          entries[de.compound_id][de.feature_id] = de.value.first
        end
        $logger.debug "Retrieving data: #{Time.now-t}"
        t = Time.now
        @data_entries = {}
        # TODO: check performance overhead
        compound_ids.each do |cid|
          entries[cid] ||= {} # compounds without data entries
          @data_entries[cid] = []
          feature_ids.each_with_index do |fid,i|
            @data_entries[cid][i] = entries[cid][fid]
          end
        end
        $logger.debug "Create @data_entries: #{Time.now-t}"
      end
      @data_entries
    end

    # Find data entry values for a given compound and feature
    # @param compound [OpenTox::Compound] OpenTox Compound object
    # @param feature [OpenTox::Feature] OpenTox Feature object
    # @return [Array] Data entry values
    def values(compound, feature)
      DataEntry.where(dataset_id: self.id, compound_id: compound.id, feature_id: feature.id).distinct(:value)
    end

    # Writers

    def compounds=(compounds)
      self.compound_ids = compounds.collect{|c| c.id}
    end

    def add_compound compound
      self.compound_ids << compound.id
    end

    def features=(features)
      self.feature_ids = features.collect{|f| f.id}
    end

    def add_feature feature
      self.feature_ids << feature.id
    end

    def self.create compounds, features, warnings=[], source=nil
      dataset = Dataset.new(:warnings => warnings, :source => source)
      dataset.compounds = compounds
      dataset.features = features
      dataset
    end

    # for prediction result datasets
    # assumes that there are features with the titles "prediction" and "confidence"
    # @return [Array] of Hashes with keys { :compound, :value, :confidence } (compound value is an object, not a URI)
    # TODO
    #def predictions
    #end

    # Serialisation

    # Convert the dataset to CSV format; the first column contains compound
    # SMILES (or InChIs), the remaining column headers are feature titles.
    # @param inchi [TrueClass,FalseClass] export InChIs instead of SMILES
    # @return [String] CSV string
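    # A minimal usage sketch (not part of the original source); dataset_id and
    # the output path are placeholders.
    # @example
    #   dataset = OpenTox::Dataset.find dataset_id
    #   File.write "dataset.csv", dataset.to_csv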
    def to_csv(inchi=false)
      CSV.generate do |csv| # :force_quotes=>true
        csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
        compounds.each_with_index do |c,i|
          csv << [inchi ? c.inchi : c.smiles] + data_entries[c.id]
        end
      end
    end

    # Methods for the validation service

    # create a new dataset with the specified compounds and features
    # @param compound_indices [Array] compound indices (integers)
    # @param feats [Array] feature objects
    # @param metadata [Hash]
    # @return [OpenTox::Dataset]
    # TODO
    def split( compound_indices, feats, metadata)
      bad_request_error "Dataset.split : Please give compounds as indices" if compound_indices.size==0 or !compound_indices[0].is_a?(Fixnum)
      bad_request_error "Dataset.split : Please give features as feature objects (given: #{feats})" if feats!=nil and feats.size>0 and !feats[0].is_a?(OpenTox::Feature)
      dataset = OpenTox::Dataset.new
      # TODO: metadata is not persisted, the Mongoid port has no metadata field
      dataset.features = (feats ? feats : self.features)
      split_features = dataset.features
      compound_indices.each do |c_idx|
        cid = self.compound_ids[c_idx]
        dataset.compound_ids << cid
        row = self.data_entries[cid]
        split_features.each_with_index do |f,f_idx|
          value = row ? row[f_idx] : nil
          dataset.bulk << [cid.to_s, f.id.to_s, value] unless value.nil?
        end
      end
      dataset.bulk_write
      dataset.save
      dataset
    end

    # maps a compound-index from another dataset to a compound-index from this dataset
    # mapping works as follows:
    # (compound c is the compound identified by the compound-index of the other dataset)
    # * c occurs only once in this dataset? map the compound-index of the other dataset to the index in this dataset
    # * c occurs >1 in this dataset?
    # ** number of occurrences is equal in both datasets? assume order is preserved(!) and map accordingly
    # ** number of occurrences is not equal in both datasets? cannot map, raise error
    # @param dataset [OpenTox::Dataset] dataset that should be mapped to this dataset (fully loaded)
    # @param compound_index [Fixnum] compound index in the other dataset
    # TODO
    def compound_index( dataset, compound_index )
      compound_inchi = dataset.compounds[compound_index].inchi
      self_indices = compound_indices(compound_inchi)
      if self_indices==nil
        nil
      else
        dataset_indices = dataset.compound_indices(compound_inchi)
        if self_indices.size==1
          self_indices.first
        elsif self_indices.size==dataset_indices.size
          # we do assume that the order is preserved (i.e., the nth occurrences in both datasets are mapped to each other)!
          self_indices[dataset_indices.index(compound_index)]
        else
          raise "cannot map compound #{compound_inchi} from dataset #{dataset.id} to dataset #{self.id}, "+
            "compound occurs #{dataset_indices.size} times and #{self_indices.size} times"
        end
      end
    end

    # returns the indices of a compound in the dataset
    # @param compound_inchi [String]
    # @return [Array] compound indices (positions) of the compound in the dataset; the array size is 1 unless there are multiple occurrences
    # TODO
    def compound_indices( compound_inchi )
      unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound_inchi)
        @cmp_indices = {}
        compounds.each_with_index do |c,i|
          inchi = c.inchi
          @cmp_indices[inchi] ||= []
          @cmp_indices[inchi] << i
        end
      end
      @cmp_indices[compound_inchi]
    end

    # Adding data methods
    # (Alternatively, you can modify compound_ids and feature_ids directly and append values via bulk/bulk_write.)

    # Create a dataset from file (csv, sdf, ...)
    # @param filename [String]
    # @return [OpenTox::Dataset]
    # TODO
    #def self.from_sdf_file
    #end
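    # A minimal usage sketch (not part of the original source): bulk rows are
    # [compound_id, feature_id, value] triples; the compound and feature
    # objects below are placeholders.
    # @example
    #   dataset.bulk << [compound.id.to_s, feature.id.to_s, 1.23]
    #   dataset.bulk_write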
    # Flush @bulk to the data_entries collection with a single mongoimport call
    def bulk_write
      time = Time.now
      # Workaround for slow Mongo bulk insertions (inserting single data_entries is far too slow).
      # Skip Ruby JSON serialisation:
      # - to_json is too slow to write to file
      # - JSON (or BSON) serialisation is probably causing very long parse times of Mongo::BulkWrite, or of any other Ruby insert operation
      # This method causes a noticeable overhead compared to direct string serialisation (e.g. total processing time 16" instead of 12" for the rat fminer dataset), but it can be reused at different places.
      dataset_id = self.id.to_s
      f = Tempfile.new("#{dataset_id}.json","/tmp")
      f.puts @bulk.collect{|row| "{\"dataset_id\": {\"$oid\": \"#{dataset_id}\"}, \"compound_id\": {\"$oid\": \"#{row[0]}\"}, \"feature_id\": {\"$oid\": \"#{row[1]}\"}, \"value\": #{row[2].to_json}}"}.join("\n")
      f.close
      $logger.debug "Write JSON file: #{Time.now-time}"
      # TODO: DB name from config
      puts `mongoimport --db opentox --collection data_entries --type json --file #{f.path} 2>&1`
      $logger.debug "Bulk import: #{Time.now-time}"
      @bulk = []
    end

    # Create a dataset from a CSV file
    # @param file [String] CSV file path
    # @param source [String] data source (defaults to the file path)
    # @param bioassay [TrueClass,FalseClass] create bioassay features
    # @return [OpenTox::Dataset]
    def self.from_csv_file file, source=nil, bioassay=true
      source ||= file
      table = CSV.read file, :skip_blanks => true
      parse_table table, source, bioassay
    end

    # parse data in tabular format (e.g. from csv)
    # does a lot of guesswork in order to determine feature types
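    # A hypothetical input sketch (not part of the original source): the first
    # header column names the compound format, the remaining columns are
    # feature names; file and feature names below are placeholders.
    # @example
    #   # example.csv:
    #   #   SMILES,LC50_mmol
    #   #   CC(O)=O,1.6
    #   dataset = OpenTox::Dataset.from_csv_file "example.csv"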
    def self.parse_table table, source, bioassay=true
      time = Time.now

      # features
      feature_names = table.shift.collect{|f| f.strip}
      dataset = Dataset.new(:source => source)
      dataset.warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.strip
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
      numeric = []
      # guess feature types
      feature_names.each_with_index do |f,i|
        values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
        types = values.collect{|v| v.numeric? ? true : false}.uniq
        metadata = {"name" => File.basename(f), "source" => source}
        feature = nil
        if values.size == 0 # empty feature
        elsif values.size > 5 and types.size == 1 and types.first == true # more than 5 distinct values, all numeric
          metadata["numeric"] = true
          numeric[i] = true
        else # nominal feature with at most 5 classes
          metadata["nominal"] = true
          metadata["accept_values"] = values
          numeric[i] = false
        end
        if bioassay
          if metadata["numeric"]
            feature = NumericBioAssay.find_or_create_by(metadata)
          elsif metadata["nominal"]
            feature = NominalBioAssay.find_or_create_by(metadata)
          end
        else
          metadata.merge!({"measured" => false, "calculated" => true})
          if metadata["numeric"]
            feature = NumericFeature.find_or_create_by(metadata)
          elsif metadata["nominal"]
            feature = NominalFeature.find_or_create_by(metadata)
          end
        end
        # empty features get a generic placeholder to keep column indices aligned
        feature ||= OpenTox::Feature.find_or_create_by(metadata)
        dataset.feature_ids << feature.id
      end
      feature_ids = dataset.features.collect{|f| f.id.to_s}
      $logger.debug "Feature values: #{Time.now-time}"
      time = Time.now

      # compounds and values
      compound_time = 0
      table.each_with_index do |vals,j|
        ct = Time.now
        identifier = vals.shift
        begin
          case compound_format
          when /SMILES/i
            compound = OpenTox::Compound.from_smiles(identifier)
            if compound.inchi.empty?
              dataset.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{j+2}, all entries are ignored."
              next
            end
          when /InChI/i
            compound = OpenTox::Compound.from_inchi(identifier)
          end
        rescue
          dataset.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{j+2}, all entries are ignored."
          next
        end
        compound_time += Time.now-ct
        dataset.compound_ids << compound.id
        unless vals.size == feature_ids.size # way cheaper than accessing dataset.features
          dataset.warnings << "Number of values at position #{j+2} (#{vals.size}) differs from the number of features (#{feature_ids.size}), all entries are ignored."
          next
        end
        cid = compound.id.to_s
        vals.each_with_index do |v,i|
          if v.blank?
            dataset.warnings << "Empty value for compound '#{identifier}' (row #{j+2}) and feature '#{feature_names[i]}' (column #{i+2})."
            next
          elsif numeric[i]
            dataset.bulk << [cid,feature_ids[i],v.to_f]
          else
            dataset.bulk << [cid,feature_ids[i],v.split]
          end
        end
      end
      dataset.compounds.duplicates.each do |compound|
        positions = []
        dataset.compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
        dataset.warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
      end
      $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
      dataset.bulk_write
      dataset.save
      dataset
    end

  end

  class LazarPrediction < Dataset
    field :creator, type: String
  end

  class FminerDataset < Dataset
    field :training_algorithm, type: String
    field :training_dataset_id, type: BSON::ObjectId
    field :training_feature_id, type: BSON::ObjectId
    field :training_parameters, type: Hash
  end

end
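
# A minimal end-to-end sketch (not part of the original source; it assumes a
# configured MongoDB connection, mongoimport on the PATH, and an "example.csv"
# file as outlined above):
#
#   dataset = OpenTox::Dataset.from_csv_file "example.csv"
#   $stderr.puts dataset.warnings.join("\n") unless dataset.warnings.empty?
#   puts dataset.to_csv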