//
// This file is part of YODA -- Yet more Objects for Data Analysis
// Copyright (C) 2008-2025 The YODA collaboration (see AUTHORS for details)
//
#ifndef YODA_StatsUtils_H
#define YODA_StatsUtils_H

#include "YODA/Exceptions.h"
#include "YODA/Config/BuildConfig.h"
#include "YODA/Utils/MathUtils.h"

#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>

namespace YODA {

  /// @name Statistics functions
  /// @{

  /// @brief Effective number of entries from accumulated weight sums.
  ///
  /// Evaluates sqr(@a sumW) / @a sumW2, i.e. the squared sum of weights over
  /// the sum of squared weights. If @a sumW2 passes the isZero() test the
  /// result is 0, which avoids dividing by a vanishing denominator.
  inline double effNumEntries(const double sumW, const double sumW2) {
    return isZero(sumW2) ? 0.0 : sqr(sumW) / sumW2;
  }

  /// @brief Effective number of entries of a list of @a weights.
  ///
  /// Accumulates the sum of the weights and the sum of their squares, then
  /// defers to the (sumW, sumW2) overload.
  inline double effNumEntries(const std::vector<double>& weights) {
    double totalW = 0.0;
    double totalW2 = 0.0;
    for (const double w : weights) {
      totalW += w;
      totalW2 += sqr(w);
    }
    return effNumEntries(totalW, totalW2);
  }

  /// @brief Unweighted arithmetic mean of a @a sample.
  ///
  /// Each element is converted to double and summed; the sum is divided by
  /// the number of elements. An empty sample therefore evaluates 0.0/0.0.
  template<typename T>
  inline double mean(const std::vector<T>& sample) {
    double total = 0.0;
    for (const T& value : sample) {
      total += static_cast<double>(value);
    }
    return total / static_cast<double>(sample.size());
  }

  /// @brief Weighted mean from accumulated sums.
  ///
  /// Returns @a sumWX / @a sumW, or a quiet NaN when @a sumW is zero.
  inline double mean(const double sumWX, const double sumW) {
    if (sumW == 0.0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    return sumWX / sumW;
  }

  /// @brief Weighted mean of a @a sample with per-entry @a weights.
  ///
  /// Accumulates sum(w) and sum(w*x) and defers to the (sumWX, sumW) overload.
  ///
  /// @throws RangeError if @a sample and @a weights differ in length.
  inline double mean(const std::vector<double>& sample,
                     const std::vector<double>& weights) {
    const size_t nEntries = sample.size();
    if (nEntries != weights.size())  throw RangeError("Inputs should have equal length!");
    double totalW = 0.0;
    double totalWX = 0.0;
    for (size_t idx = 0; idx != nEntries; ++idx) {
      const double w = weights[idx];
      totalW  += w;
      totalWX += w*sample[idx];
    }
    return mean(totalWX, totalW);
  }

  /// @brief Weighted variance from accumulated sums.
  ///
  /// Evaluates
  ///   sig2 = ( sum(wx**2) * sum(w) - sum(wx)**2 ) / ( sum(w)**2 - sum(w**2) )
  /// (see http://en.wikipedia.org/wiki/Weighted_mean) using subtract() for
  /// both differences. The absolute value of the ratio is returned, or a
  /// quiet NaN if the denominator is zero.
  ///
  /// @todo Isn't this sensitive to the overall scale of the weights?
  /// Shouldn't it check if the denominator is bigger than the numerator by a
  /// set number of orders of magnitude and vice versa? (A fixed cut returning
  /// NaN when both |num| and |den| are below 1e-10 is currently disabled.)
  ///
  /// @todo Is taking the modulus the correct approach? The ratio can be
  /// negative with weighted means, and there is no information online other
  /// than "weights are non-negative"...
  inline double variance(const double sumWX, const double sumW,
                         const double sumWX2, const double sumW2) {
    const double numerator = subtract(sumWX2*sumW, sqr(sumWX));
    const double denominator = subtract(sqr(sumW), sumW2);
    if (denominator == 0.0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    return std::fabs(numerator/denominator);
  }

  /// @brief Weighted variance of a @a sample with per-entry @a weights.
  ///
  /// Returns a quiet NaN when the effective number of entries is (fuzzily)
  /// at most one; throwing a LowStatsError in that case is left disabled.
  /// Otherwise the four weighted sums are accumulated and passed on to the
  /// sums-based overload.
  ///
  /// @throws RangeError if @a sample and @a weights differ in length.
  inline double variance(const std::vector<double>& sample,
                         const std::vector<double>& weights) {
    const size_t nEntries = sample.size();
    if (nEntries != weights.size())  throw RangeError("Inputs should have equal length!");
    const double effN = effNumEntries(weights);
    if (fuzzyLessEquals(effN, 1.0)) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    double totalW = 0.0, totalWX = 0.0;
    double totalW2 = 0.0, totalWX2 = 0.0;
    for (size_t idx = 0; idx != nEntries; ++idx) {
      const double w = weights[idx];
      const double x = sample[idx];
      totalW   += w;
      totalWX  += w*x;
      totalW2  += sqr(w);
      totalWX2 += w*sqr(x);
    }
    return variance(totalWX, totalW, totalWX2, totalW2);
  }

  /// @brief Weighted standard deviation from accumulated sums.
  ///
  /// This is the square root of variance() evaluated on the same four sums.
  inline double stdDev(const double sumWX, const double sumW,
                       const double sumWX2, const double sumW2) {
    const double var = variance(sumWX, sumW, sumWX2, sumW2);
    return std::sqrt(var);
  }

  /// @brief Weighted standard deviation of a @a sample with per-entry @a weights.
  ///
  /// This is the square root of variance(sample, weights).
  inline double stdDev(const std::vector<double>& sample,
                       const std::vector<double>& weights) {
    const double var = variance(sample, weights);
    return std::sqrt(var);
  }

  /// @brief Weighted standard error from accumulated sums.
  ///
  /// Returns sqrt(variance / effective number of entries), or a quiet NaN
  /// when the effective number of entries is zero.
  inline double stdErr(const double sumWX, const double sumW,
                       const double sumWX2, const double sumW2) {
    const double nEff = effNumEntries(sumW, sumW2);
    if (nEff == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    return std::sqrt(variance(sumWX, sumW, sumWX2, sumW2) / nEff);
  }

  /// @brief Weighted standard error of a @a sample with per-entry @a weights.
  ///
  /// Returns sqrt(variance / effective number of entries), or a quiet NaN
  /// when the effective number of entries is zero.
  ///
  /// @throws RangeError if @a sample and @a weights differ in length.
  inline double stdErr(const std::vector<double>& sample,
                       const std::vector<double>& weights) {
    if (sample.size() != weights.size())  throw RangeError("Inputs should have equal length!");
    const double nEff = effNumEntries(weights);
    if (nEff == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    return std::sqrt(variance(sample, weights) / nEff);
  }

  /// @brief Weighted root-mean-square from accumulated sums.
  ///
  /// Defined as rms = sqrt(sum{w x^2} / sum{w}). @a sumW2 is only used to
  /// obtain the effective number of entries: when that is zero a quiet NaN
  /// is returned instead.
  inline double rms(const double sumWX2, const double sumW, const double sumW2) {
    if (effNumEntries(sumW, sumW2) == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    }
    return std::sqrt(sumWX2 / sumW);
  }

  /// @brief Weighted root-mean-square of a @a sample with per-entry @a weights.
  ///
  /// Accumulates sum(w), sum(w^2) and sum(w x^2) and defers to the sums-based
  /// overload.
  ///
  /// @throws RangeError if @a sample and @a weights differ in length.
  inline double rms(const std::vector<double>& sample,
                    const std::vector<double>& weights) {
    const size_t nEntries = sample.size();
    if (nEntries != weights.size())  throw RangeError("Inputs should have equal length!");
    double totalW = 0.0;
    double totalW2 = 0.0;
    double totalWX2 = 0.0;
    for (size_t idx = 0; idx != nEntries; ++idx) {
      const double w = weights[idx];
      totalW   += w;
      totalW2  += sqr(w);
      totalWX2 += w*sqr(sample[idx]);
    }
    return rms(totalWX2, totalW, totalW2);
  }

  /// @brief Upper-case alias forwarding to rms(sumWX2, sumW, sumW2).
  ///
  /// @deprecated Just use rms()!
  inline double RMS(const double sumWX2, const double sumW, const double sumW2) {
    const double result = rms(sumWX2, sumW, sumW2);
    return result;
  }

  /// @brief Upper-case alias forwarding to rms(sample, weights).
  ///
  /// @deprecated Just use rms()!
  inline double RMS(const std::vector<double>& sample,
                    const std::vector<double>& weights) {
    const double result = rms(sample, weights);
    return result;
  }

  /// @brief Sample covariance between @a sample1 and @a sample2.
  ///
  /// Sums the products of the deviations from the respective unweighted
  /// means over the first sample1.size() entries and divides by N-1. With
  /// fewer than two entries in @a sample1 the result is 0. Passing the same
  /// vector twice yields its variance.
  ///
  /// @note NOTE(review): no length check is made here, unlike the weighted
  /// functions in this file. @a sample2 is indexed up to sample1.size(), so it
  /// must be at least that long; its mean is taken over all of its entries.
  template<typename T>
  inline double covariance(const std::vector<T>& sample1, const std::vector<T>& sample2) {
    const double avg1 = mean(sample1);
    const double avg2 = mean(sample2);
    const size_t nEntries = sample1.size();
    double sumProducts = 0.0;
    for (size_t idx = 0; idx != nEntries; ++idx) {
      sumProducts += (sample1[idx] - avg1)*(sample2[idx] - avg2);
    }
    return (nEntries > 1) ? sumProducts/(nEntries-1) : 0.0;
  }


  /// @brief Calculate the correlation strength between two samples.
  ///
  /// Forms cov(1,2)/sqrt(var1*var2) and multiplies it by sqrt(var2/var1),
  /// where the variances are obtained as covariance(s, s). Algebraically this
  /// is cov(1,2)/var1. No guard is applied for a vanishing var1.
  template<typename T>
  inline double correlation(const std::vector<T>& sample1, const std::vector<T>& sample2) {
    const double cov12 = covariance(sample1, sample2);
    const double variance1 = covariance(sample1, sample1);
    const double variance2 = covariance(sample2, sample2);
    const double normalised = cov12/std::sqrt(variance1*variance2);
    return normalised*std::sqrt(variance2/variance1);
  }


  /// @brief Cumulative distribution of the entries of @a sample.
  ///
  /// Each output entry is the running sum of the inputs up to and including
  /// that position, with every input first divided by the total of all
  /// entries. An empty input, or one whose total is exactly zero, is returned
  /// unchanged.
  inline std::vector<double> cdf(std::vector<double> sample) {
    if (sample.empty())  return sample;

    const double norm = std::accumulate(sample.begin(), sample.end(), 0.0);
    if (norm == 0.0)  return sample;

    // Work in place on the by-value copy: normalise the first entry, then
    // carry the running total forward through the remaining ones.
    auto entry = sample.begin();
    *entry /= norm;
    double running = *entry;
    for (++entry; entry != sample.end(); ++entry) {
      running = running + *entry / norm;
      *entry = running;
    }
    return sample;
  }

  /// @brief Calculate the weighted average of two sets of values @a sample1
  /// and @a sample2 with associated uncertainties @a s1errors and @a s2errors,
  /// respectively, taking the inverse squared uncertainty as the weight.
  ///
  /// A zero uncertainty gives that entry zero weight. The return value holds
  /// two vectors of the input length: element [0] contains the weighted
  /// averages, element [1] contains 1/(w1+w2) for each point (0 if both
  /// weights vanish).
  ///
  /// @note NOTE(review): element [1] is the inverse of the summed weights and
  /// is not square-rooted -- confirm whether callers expect this or an
  /// uncertainty.
  ///
  /// @throws RangeError if the four inputs do not all have the same length.
  inline std::vector<std::vector<double>> weightedAverage(const std::vector<double>& sample1,
                                                          const std::vector<double>& s1errors,
                                                          const std::vector<double>& sample2,
                                                          const std::vector<double>& s2errors) {
    const size_t nPoints = sample1.size();
    if (nPoints != sample2.size())   throw RangeError("Inputs should have equal length!");
    if (nPoints != s1errors.size())  throw RangeError("Inputs should have equal length!");
    if (nPoints != s2errors.size())  throw RangeError("Inputs should have equal length!");

    std::vector<std::vector<double>> rtn(2);
    std::vector<double>& averages = rtn[0];
    std::vector<double>& invWeightSums = rtn[1];
    averages.reserve(nPoints);
    invWeightSums.reserve(nPoints);

    for (size_t idx = 0; idx != nPoints; ++idx) {
      double w1 = 0.0;
      double w2 = 0.0;
      if (s1errors[idx] != 0.0)  w1 = 1.0 / sqr(s1errors[idx]);
      if (s2errors[idx] != 0.0)  w2 = 1.0 / sqr(s2errors[idx]);
      const double wsum = w1 + w2;
      const double invSum = (wsum != 0.0) ? 1.0 / wsum : 0.0;
      averages.push_back( invSum * (w1*sample1[idx] + w2*sample2[idx]) );
      invWeightSums.push_back( invSum );
    }
    return rtn;
  }

  /// @brief Calculate the Kolmogorov-Smirnov test statistic between two samples.
  ///
  /// @note This implementation assumes that the bin widths are small compared
  /// with any physical phenomena of interest.
  inline double ksTest(const std::vector<double>& sample1, const std::vector<double>& sample2) {
    if (sample1.size() != sample2.size())  throw RangeError("Inputs should have equal length!");
    const std::vector<double> cdf1 = cdf(sample1);
    const std::vector<double> cdf2 = cdf(sample2);
    double D = 0.0;
    for (size_t i = 0; i < cdf1.size(); ++i) {
      D = std::max(D, std::abs(cdf1[i] - cdf2[i]));
    }
    return D;
  }

  /// @brief Approximate asymptotic p-value for a KS test statistic @a D,
  /// given two effective sample sizes @a n1 and @a n2.
  ///
  /// With neff = n1*n2/(n1+n2) and x = D*sqrt(neff), the alternating series
  /// of the Kolmogorov distribution is summed until a term falls below
  /// @a tolerance (or underflows to zero), and twice the sum is clamped to
  /// the range [0, 1].
  ///
  /// @param D          KS test statistic.
  /// @param n1         Effective size of the first sample.
  /// @param n2         Effective size of the second sample.
  /// @param tolerance  Series terms smaller than this end the summation.
  ///
  /// @return 1 if any of @a D, @a n1, @a n2 passes the isZero() test; a quiet
  /// NaN if x is not a number (NaN input, or a negative neff under the square
  /// root); otherwise the clamped p-value.
  inline double pValFromKS(const double D, const double n1, const double n2,
                           const double tolerance = 1e-5) {
    if (isZero(D) || isZero(n1) || isZero(n2))  return 1.0;
    const double neff = (n1 * n2) / (n1 + n2);
    const double x = D * std::sqrt(neff);
    // A NaN argument makes every series term NaN, so the convergence test
    // below could never succeed: report NaN rather than looping.
    if (std::isnan(x))  return std::numeric_limits<double>::quiet_NaN();
    // According to https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
    // the cumulative Kolmogorov distribution is given by
    // Pr(K <= x) = 1 - 2* sum_{k=1}^inf (-1)^(k-1) exp(-2 * k^2 * x^2)
    // and so one-sided p-value = 1 - Pr(K <= x):
    double p = 0.0;
    for (size_t k = 1; ; ++k) {
      const double term = std::exp(-2. * k * k * x * x);
      p += ((k % 2) ? 1 : -1) * term;
      // The second condition guarantees termination when the tolerance is
      // zero, negative or NaN: further terms cannot change the sum anyway.
      if (term < tolerance || term == 0.0) break;
    }
    return std::clamp(2*p, 0.0, 1.0);
  }

  /// @brief Calculate the error-weighted chi2 statistic between two samples
  ///
  /// Sums the squared differences between corresponding entries of
  /// @a sample1 and @a sample2. If at least one of @a s1errors and
  /// @a s2errors is supplied, each term is divided by the sum of the squared
  /// uncertainties that were supplied; an omitted (empty) error vector
  /// contributes nothing. With no errors at all the plain sum of squared
  /// differences is returned.
  ///
  /// @note Each error vector is treated independently, so supplying only one
  /// of them is safe: an empty vector is never indexed, and a lone
  /// @a s2errors is used rather than ignored.
  ///
  /// @note No guard is applied against a vanishing combined uncertainty; such
  /// a term follows the usual floating-point division rules.
  ///
  /// @note This calculation is rather naive as it neglects the error breakdowns that
  /// may be available for the curves being compared. More sophisticated comparisons
  /// will require a separate analysis based on the Estimate objects utilising the full
  /// covariance information.
  ///
  /// @throws RangeError if the samples differ in length, or if a non-empty
  /// error vector does not match the sample length.
  inline double naiveChi2(const std::vector<double>& sample1, const std::vector<double>& sample2,
                          const std::vector<double>& s1errors = std::vector<double>{},
                          const std::vector<double>& s2errors = std::vector<double>{}) {
    if (sample1.size() != sample2.size()) {
      throw RangeError("Inputs should have equal length!");
    }
    const bool haveErrs1 = !s1errors.empty();
    const bool haveErrs2 = !s2errors.empty();
    if (haveErrs1 && sample1.size() != s1errors.size()) {
      throw RangeError("Inputs should have equal length!");
    }
    if (haveErrs2 && sample2.size() != s2errors.size()) {
      throw RangeError("Inputs should have equal length!");
    }
    const size_t N = sample1.size();
    double chi2 = 0.0;
    for (size_t i = 0; i < N; ++i) {
      double temp = sqr(sample1[i] - sample2[i]);
      if (haveErrs1 || haveErrs2) {
        // Only index the error vectors that were actually provided.
        const double var1 = haveErrs1 ? sqr(s1errors[i]) : 0.0;
        const double var2 = haveErrs2 ? sqr(s2errors[i]) : 0.0;
        temp /= var1 + var2;
      }
      chi2 += temp;
    }
    return chi2;
  }

  /// @brief Calculate the error-weighted reduced chi2 statistic between two samples
  ///
  /// This is naiveChi2() for the same arguments divided by the number of
  /// entries in @a sample1.
  ///
  /// @note This calculation is rather naive as it neglects the error breakdowns that
  /// may be available for the curves being compared. More sophisticated comparisons
  /// will require a separate analysis based on the Estimate objects utilising the full
  /// covariance information.
  ///
  /// @throws RangeError if @a sample1 is empty, or if naiveChi2() rejects the
  /// input lengths.
  inline double naiveChi2reduced(const std::vector<double>& sample1, const std::vector<double>& sample2,
                                 const std::vector<double>& s1errors = std::vector<double>{},
                                 const std::vector<double>& s2errors = std::vector<double>{}) {
    if (sample1.empty()) {
      throw RangeError("Inputs should not have 0 length!");
    }
    const double chi2 = naiveChi2(sample1, sample2, s1errors, s2errors);
    return chi2/sample1.size();
  }

  /// @}

}

#endif
