module OpenTox

  # Cross-validation of a model: the model's training dataset is split into
  # folds, one validation is created per fold (each in its own forked process)
  # and the per-fold results are aggregated into this document.
  class CrossValidation
    field :validation_ids, type: Array, default: []
    field :model_id, type: BSON::ObjectId
    field :folds, type: Integer
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :predictions, type: Array, default: []
    field :finished_at, type: Time

    # Duration of the cross-validation run.
    # @return [Numeric, nil] finished_at - created_at, or nil as long as one
    #   of the two timestamps has not been set
    def time
      return nil unless finished_at and created_at
      finished_at - created_at
    end

    # @return [Array] the Validation documents referenced by validation_ids
    def validations
      validation_ids.collect{|vid| Validation.find vid}
    end

    # @return the Model::Lazar document referenced by model_id
    def model
      Model::Lazar.find model_id
    end

    # Runs an n-fold cross-validation for model and stores the aggregated
    # results.
    # @param model the model to validate (must respond to name, id and training_dataset_id)
    # @param n [Integer] number of folds
    # @return [CrossValidation] the saved cross-validation document
    def self.create model, n=10
      cv = self.new(
        name: model.name,
        model_id: model.id,
        folds: n
      )
      cv.save # set created_at
      # "XyzCrossValidation" is validated with the matching "XyzValidation" class
      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
      training_dataset = Dataset.find model.training_dataset_id
      training_dataset.folds(n).each_with_index do |fold,fold_nr|
        fork do # parallel execution of validations
          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
          t = Time.now
          validation_class.create(model, fold[0], fold[1],cv)
          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}:  #{Time.now-t} seconds"
        end
      end
      # A crashed fold process would otherwise go unnoticed and silently
      # produce a cross-validation that is based on fewer folds.
      Process.waitall.each do |pid,status|
        unless status.success?
          $logger.warn "Dataset #{training_dataset.name}: validation process #{pid} failed (#{status.inspect})"
        end
      end
      # The validations were created in child processes, so they have to be
      # looked up in the database.
      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
      nr_instances = 0
      nr_unpredicted = 0
      predictions = []
      cv.validations.each do |validation|
        nr_instances += validation.nr_instances
        nr_unpredicted += validation.nr_unpredicted
        predictions.concat validation.predictions
      end
      cv.update_attributes(
        nr_instances: nr_instances,
        nr_unpredicted: nr_unpredicted,
        predictions: predictions
      )
      cv
    end
  end

  # Cross-validation of a classification model with confusion matrix based
  # statistics. The statistics are computed for the first two accept values
  # of the prediction feature.
  class ClassificationCrossValidation < CrossValidation

    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash
    field :predictivity, type: Hash
    # TODO auc, f-measure (usability??)

    # Runs the cross-validation (see CrossValidation.create) and adds the
    # classification statistics.
    # @param model the classification model to validate
    # @param n [Integer] number of folds
    # @return [ClassificationCrossValidation] the saved cross-validation document
    def self.create model, n=10
      cv = super model, n
      accept_values = Feature.find(model.prediction_feature_id).accept_values
      # First index: predicted value, second index: measured value.
      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
      cv.predictions.each do |pred|
        _compound_id,activity,prediction,confidence = pred
        # Entries without activity, prediction or numeric confidence are not
        # evaluated. Unpredicted compounds are not counted here:
        # cv.nr_unpredicted has already been aggregated from the fold
        # validations by the call to super.
        next unless (activity and prediction and confidence.numeric?)
        row = if prediction == accept_values[0]
          0
        elsif prediction == accept_values[1]
          1
        end
        next unless row
        # correct predictions go to the diagonal, wrong ones to the other column
        col = prediction == activity ? row : 1-row
        confusion_matrix[row][col] += 1
        weighted_confusion_matrix[row][col] += confidence
      end
      true_rate = {}
      predictivity = {}
      accept_values.each_with_index do |v,i|
        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|row| row[i]}.reduce(:+).to_f
      end
      confidence_sum = weighted_confusion_matrix.flatten.reduce(0,:+)
      cv.update_attributes(
        accept_values: accept_values,
        confusion_matrix: confusion_matrix,
        weighted_confusion_matrix: weighted_confusion_matrix,
        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f,
        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
        true_rate: true_rate,
        predictivity: predictivity,
        finished_at: Time.now
      )
      cv.save
      cv
    end

    #Average area under roc  0.646
    #Area under roc  0.646
    #F measure carcinogen: 0.769, noncarcinogen: 0.348
  end

  # Cross-validation of a regression model with error statistics on
  # log10-transformed values and SVG plots that are stored in GridFS.
  # Prediction entries are arrays of
  # [compound_id, measured activity, predicted activity, confidence].
  class RegressionCrossValidation < CrossValidation

    field :rmse, type: Float
    field :mae, type: Float
    field :weighted_rmse, type: Float
    field :weighted_mae, type: Float
    field :r_squared, type: Float
    field :correlation_plot_id, type: BSON::ObjectId
    field :confidence_plot_id, type: BSON::ObjectId

    # Runs the cross-validation (see CrossValidation.create) and adds the
    # regression statistics.
    # @param model the regression model to validate
    # @param n [Integer] number of folds
    # @return [RegressionCrossValidation] the saved cross-validation document
    def self.create model, n=10
      cv = super model, n
      rmse = 0
      weighted_rmse = 0
      mae = 0
      weighted_mae = 0
      confidence_sum = 0
      # [activity, prediction] pairs that actually contribute to the statistics
      evaluated = []
      cv.predictions.each do |pred|
        compound_id,activity,prediction,confidence = pred
        if activity and prediction
          error = Math.log10(prediction)-Math.log10(activity)
          rmse += error**2
          weighted_rmse += confidence*error**2
          mae += error.abs
          weighted_mae += confidence*error.abs
          confidence_sum += confidence
          evaluated << [activity,prediction]
        else
          message = "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}."
          cv.warnings << message
          $logger.debug message
        end
      end
      # Only complete pairs are sent to R, missing values would distort or
      # invalidate the correlation.
      R.assign "measurement", evaluated.collect{|pair| pair[0]}
      R.assign "prediction", evaluated.collect{|pair| pair[1]}
      R.eval "r <- cor(-log(measurement),-log(prediction))"
      r = R.eval("r").to_ruby

      # Average over the evaluated predictions only: skipped entries did not
      # add any error and must not lower the mean.
      mae = mae/evaluated.size
      weighted_mae = weighted_mae/confidence_sum
      rmse = Math.sqrt(rmse/evaluated.size)
      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
      # TODO check!! (disabled: sort predictions by relative error and confidence)
      # cv.predictions.sort! do |a,b|
      #   relative_error_a = (a[1]-a[2]).abs/a[1].to_f
      #   relative_error_a = 1/relative_error_a if relative_error_a < 1
      #   relative_error_b = (b[1]-b[2]).abs/b[1].to_f
      #   relative_error_b = 1/relative_error_b if relative_error_b < 1
      #   [relative_error_b,b[3]] <=> [relative_error_a,a[3]]
      # end
      cv.update_attributes(
        mae: mae,
        rmse: rmse,
        weighted_mae: weighted_mae,
        weighted_rmse: weighted_rmse,
        r_squared: r**2,
        finished_at: Time.now # needed by CrossValidation#time
      )
      cv.save
      cv
    end

    # Detailed report for the first n predictions, in stored order (the
    # sorting by relative error in create is currently disabled).
    # @param n [Integer, nil] number of predictions to report, 20 if nil
    # @return [Array<Hash>] one hash per prediction with the keys :smiles,
    #   :fingerprint, :measured, :predicted, :relative_error, :confidence
    #   and :neighbors
    def misclassifications n=nil
      n ||= 20
      training_dataset = Dataset.find(model.training_dataset_id)
      prediction_feature = training_dataset.features.first
      predictions[0..n-1].collect do |p|
        compound = Compound.find(p[0])
        # each neighbor entry is read as [compound id, similarity]
        neighbors = compound.neighbors.collect do |entry|
          neighbor = Compound.find(entry[0])
          values = training_dataset.values(neighbor,prediction_feature)
          { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => entry[1], :measurements => values}
        end
        {
          :smiles => compound.smiles,
          :fingerprint => compound.fp4.collect{|id|  Smarts.find(id).name},
          :measured => p[1],
          :predicted => p[2],
          :relative_error => (p[1]-p[2]).abs/p[1].to_f,
          :confidence => p[3],
          :neighbors => neighbors
        }
      end
    end

    # SVG plot of the squared log10 error against the prediction confidence.
    # The plot is rendered once, stored in GridFS and served from there on
    # subsequent calls.
    # @return the data of the stored plot file
    def confidence_plot
      unless confidence_plot_id
        tmpfile = "/tmp/#{id.to_s}_confidence.svg"
        # entries with missing values can neither be sorted nor log-transformed
        complete = predictions.select{|p| p[1] and p[2] and p[3]}
        sorted_predictions = complete.sort{|a,b| b[3]<=>a[3]}.collect{|p| [(Math.log10(p[1])-Math.log10(p[2]))**2,p[3]]}
        # both vectors come from the same sorted list to keep the pairs aligned
        R.assign "error", sorted_predictions.collect{|p| p[0]}
        R.assign "confidence", sorted_predictions.collect{|p| p[1]}
        R.eval "library(ggplot2)"
        R.eval "svg(filename='#{tmpfile}')"
        R.eval "image = qplot(confidence,error)"#,main='#{self.name}',asp=1,xlim=range, ylim=range)"
        R.eval "ggsave(file='#{tmpfile}', plot=image)"
        R.eval "dev.off()"
        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
        plot_id = $gridfs.insert_one(file)
        update(:confidence_plot_id => plot_id)
      end
      $gridfs.find_one(_id: confidence_plot_id).data
    end

    # SVG plot of predictions against measurements (both as -log values)
    # together with RMSE, MAE, r^2 and the model parameters. The plot is
    # rendered once, stored in GridFS and served from there on subsequent
    # calls.
    # NOTE(review): name and the model attributes are interpolated unescaped
    # into single-quoted R strings, a "'" in one of them breaks the R code.
    # @return the data of the stored plot file
    def correlation_plot
      unless correlation_plot_id
        tmpfile = "/tmp/#{id.to_s}_correlation.svg"
        # plot complete measurement/prediction pairs only
        complete = predictions.select{|p| p[1] and p[2]}
        x = complete.collect{|p| p[1]}
        y = complete.collect{|p| p[2]}
        # model parameters for the text panel, without ids, timestamps and names
        attributes = Model::Lazar.find(self.model_id).attributes
        attributes = attributes.reject{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
        R.eval "library(ggplot2)"
        R.eval "library(grid)"
        R.eval "library(gridExtra)"
        R.assign "measurement", x
        R.assign "prediction", y
        R.eval "svg(filename='#{tmpfile}')"
        R.eval "all = c(-log(measurement),-log(prediction))"
        R.eval "range = c(min(all), max(all))"
        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
        R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)"
        R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
        R.eval "grid.arrange(image, text, ncol=2)"
        R.eval "dev.off()"
        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
        plot_id = $gridfs.insert_one(file)
        update(:correlation_plot_id => plot_id)
      end
      $gridfs.find_one(_id: correlation_plot_id).data
    end
  end


end