Source code for process_improve.univariate.metrics

from __future__ import annotations

import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any, NoReturn

import numpy as np
import pandas as pd
from scipy.stats import shapiro, t

if TYPE_CHECKING:
    from collections.abc import Callable

__eps = np.finfo(np.float32).eps



[docs]
def t_value(p: float, v: float) -> float:
    r"""
    Return the value on the x-axis if you plot the cumulative t-distribution with a fractional
    area of `p` (p is therefore a fractional value between 0 and 1 on the y-axis) and `v` is the
    degrees of freedom.


    Examples
    --------
    Since the cumulative distribution passes symmetrically through the x-axis at 0.0 for any
    number of degrees of freedom

    >>> t_value(0.5, v)
    0.0

    Zero fractional area under the curve is always at :math:`-\infty`:

    >>> t_value(0.0, v)
    -Inf

    100% fractional area is always at :math:`+\infty`:

    >>> t_value(1.0, v)
    +Inf

    See also
    --------
    t_value_cdf: does the inverse of this function.
    """
    return t.ppf(p, df=v)




[docs]
def t_value_cdf(z: float, v: float) -> float:
    r"""
    Return the fractional area under the cumulative t-distribution (y-axis value) at the t-value
    `z` on the x-axis, with `v` degrees of freedom.

    Examples
    --------
    The cumulative distribution is symmetric through the x-axis at 0.0 for any number of degrees
    of freedom, so half of the area lies below zero:

    >>> t_value_cdf(0.0, v)
    0.5

    Zero fractional area under the curve is at :math:`-\infty`:

    >>> t_value_cdf(-np.inf, v)
    0.0

    100% fractional area is at :math:`+\infty`:

    >>> t_value_cdf(np.inf, v)
    1.0

    See also
    --------
    t_value: does the inverse of this function.
    """
    return t.cdf(z, df=v)




[docs]
def test_normality(x: np.ndarray | pd.Series) -> float:
    """
    Check the p-value of the hypothesis that the data are from a normal distribution.

    If the p-value is less than the chosen alpha level (e.g. 0.05 or 0.025), then there is
    evidence that the data tested are NOT normally distributed.

    On the other hand, if the p-value is greater than the chosen alpha level, then the null
    hypothesis that the data came from a normally distributed population can not be rejected.
    NOTE: it does not mean that the data are normally distributed, just that we have nothing better
    to say about it. See the `Shapiro-Wilk test`_.

    Implementation: Uses the Shapiro Wilk test directly taken from `scipy.stats.shapiro`_.

    .. _Shapiro-Wilk test:
        https://en.wikipedia.org/wiki/Shapiro-Wilk_test

    .. _scipy.stats.shapiro:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
    """
    output = shapiro(x)
    return output[1]




[docs]
def Sn(x: np.ndarray | pd.Series, constant: float = 1.1926) -> np.floating:  # noqa: N802
    """
    Compute a robust scale estimator. The Sn metric is an efficient alternative to MAD.

    Parameters
    ----------
    x : np.ndarray or pd.Series
        A vector of values. NaN entries are ignored.
    constant : float, optional
        Multiplicative constant that makes the estimator consistent with iid values from a
        Gaussian distribution with no outliers. Default is 1.1926.

    Returns
    -------
    np.floating
        A scalar value, the Sn estimate of spread. Returns NaN if all entries are missing,
        and 0.0 when only a single non-missing value is supplied.

    Notes
    -----
    Tested against once of the most reliable open-source packages, written by some of the
    most respected names in the area of robust methods: [1]_ and [2]_.

    Disadvantages of MAD:

    *   It does not have particularly high efficiency for data that is in fact normal (37%).
        In comparison, the median has 64% efficiency for normal data.

    *   The MAD statistic also has an implicit assumption of symmetry. That is, it measures the
        distance from a measure of central location (the median).

    References
    ----------
    .. [1] https://cran.r-project.org/web/packages/robustbase/
    .. [2] Rousseeuw, Peter J.; Croux, Christophe (December 1993),
        "Alternatives to the Median Absolute Deviation", Journal of the American Statistical
        Association, American Statistical Association, 88 (424): 1273-1283,
        https://dx.doi.org/10.2307/2291267

    """
    arr = np.asarray(x, dtype=np.float64)
    n = np.sum(~np.isnan(arr))
    if n == 0:
        return np.float64(np.nan)
    elif n == 1:
        return np.float64(0.0)

    # Remove NaNs and compute all pairwise absolute differences via broadcasting
    clean = arr[~np.isnan(arr)]
    diffs = np.abs(clean[:, None] - clean[None, :])
    medians = np.median(diffs, axis=1)

    if n <= 9:
        # Correction factors for n = 2 to 9:
        correction = [0.743, 1.851, 0.954, 1.351, 0.993, 1.198, 1.005, 1.131][n - 2]
    elif n % 2:
        correction = n / (n - 0.9)  # odd number of cases
    else:
        correction = 1.0

    return constant * np.median(medians) * correction



def _contains_nan(a: np.ndarray, nan_policy: str = "propagate") -> tuple[bool, str]:
    """From scipy.stats.stats."""
    policies = ["propagate", "raise", "omit"]
    if nan_policy not in policies:
        raise ValueError(f"nan_policy must be one of {{{', '.join(f'{s!r}' for s in policies)}}}")
    try:
        # Calling np.sum to avoid creating a huge array into memory
        # e.g. np.isnan(a).any()
        with np.errstate(invalid="ignore"):
            contains_nan = np.isnan(np.sum(a))
    except TypeError:
        # This can happen when attempting to sum things which are not
        # numbers (e.g. as in the function `mode`). Try an alternative method:
        try:
            contains_nan = any(np.isnan(v) for v in a.ravel())
        except TypeError:  # pragma: no cover
            # Don't know what to do. Fall back to omitting nan values and
            # issue a warning.
            contains_nan = False
            nan_policy = "omit"
            warnings.warn(
                "The input array could not be properly checked for nan values. nan values will be ignored.",
                RuntimeWarning,
                stacklevel=2,
            )

    if contains_nan and nan_policy == "raise":
        raise ValueError("The input contains nan values")

    return (contains_nan, nan_policy)



[docs]
def ttest_independent(sample_A: pd.Series, sample_B: pd.Series, conflevel: float = 0.995) -> dict:
    """Core calculation for a test of differences between the average of A and the average of B.
    No checking of inputs.

    Parameters
    ----------
    sample_A : iterable
        Vector of n_a measurements.
    sample_B : iterable
        Vector of n_b measurements.
    conflevel : float
        Value between 0 and 1 (closer to 1.0), that gives the level of confidence required for
        the 2-sided test.

    Returns
    -------
    dict
        Outcomes from the statistical test.
    """
    axis = 0
    v1, v2 = sample_A.var(axis, ddof=1), sample_B.var(axis, ddof=1)
    n_A, n_B = sample_A.shape[axis], sample_B.shape[axis]
    m1, m2 = sample_A.mean(), sample_B.mean()
    df = n_A + n_B - 2.0
    ct = abs(t_value(p=(1 - conflevel) / 2.0, v=df))
    d = m2 - m1
    svar = ((n_A - 1) * v1 + (n_B - 1) * v2) / df

    with np.errstate(divide="ignore", invalid="ignore"):
        sd_z_variate = np.sqrt(svar * (np.divide(1.0, n_A) + np.divide(1.0, n_B)))
        z_variate = np.divide(d, sd_z_variate)

    confint_lo = d - ct * sd_z_variate
    confint_hi = d + ct * sd_z_variate

    return {
        "Group A number": int(n_A),
        "Group B number": int(n_B),
        "Group A average": sample_A.mean(),
        "Group B average": sample_B.mean(),
        "z value": z_variate,
        "ConfInt: Lo": confint_lo,
        "ConfInt: Hi": confint_hi,
        "p value": 2 * t_value_cdf(-np.abs(z_variate), df),
        "Degrees of freedom": df,
        "Pooled standard deviation": sd_z_variate,
    }




[docs]
def ttest_independent_from_df(
    df: pd.DataFrame, grouper_column: str, values_column: str, conflevel: float = 0.995,
) -> pd.DataFrame:
    """
    Calculate the t-test for differences between two or more groups and returns a confidence
    interval for the difference. The test is for UNPAIRED differences.

    The dataframe `df` contains a `grouper_column` with 2 or more unique values (e.g. 'A' and 'B').
    All unique values of the `grouper_column` are used, and t-tests are done between the values
    in the `values_column`.

    Args:
        df (pd.DataFrame): Dataframe of the values and grouping variable.
        grouper_column (str): Indicates which column will be grouped on.
        values_column (str): Which column contains the numeric values to calculate the test on.
        conflevel (float, optional): [description]. Defaults to 0.995.

    Output: Dataframe with columns containing the statistical outputs of the t-test, including:
        1. Group "A" name
        2. Group "B" name
        3. Group "A" mean
        4. Group "B" mean
        5. z-value for the difference between group "B" minus group "A"
        6. p-value for this z-value
        7. Confidence interval low value for difference between group "B" minus group "A"
        8. Confidence interval high value for difference between group "B" minus group "A"

    Example:
    df : has 3 levels in the grouper variable; ['Marco', 'Pete', 'Sam']

    Output will have 3 rows:
    Group A name  Group B name
    Marco         Pete
    Marco         Sam
    Pete          Sam
    """

    data_subset = df[[grouper_column, values_column]].copy()
    data_subset = data_subset.dropna()
    groups = df[grouper_column].unique()
    output = pd.DataFrame()
    groups = list(groups)
    while len(groups) > 0:
        groupA_name = groups.pop(0)
        for groupB_name in groups:
            sample_A = data_subset[data_subset[grouper_column].eq(groupA_name)][values_column]
            sample_B = data_subset[data_subset[grouper_column].eq(groupB_name)][values_column]
            sample_A = sample_A.astype(np.float64)
            sample_B = sample_B.astype(np.float64)
            basic_stats = ttest_independent(sample_A, sample_B, conflevel)
            basic_stats.update(
                {
                    "Group A name": groupA_name,
                    "Group B name": groupB_name,
                }
            )
            output = pd.concat([output, pd.DataFrame(basic_stats, index=[0])])

    return output




[docs]
def ttest_paired(differences: pd.Series, conflevel: float = 0.995) -> dict:
    """Core calculation for a test of paired differences.

    Parameters
    ----------
    differences : pd.Series
        The paired differences (e.g. ``sample_A - sample_B``) for which the test is run.
    conflevel : float, optional
        Value between 0 and 1 (closer to 1.0), that gives the level of confidence required for
        the 2-sided test. Default is 0.995.

    Returns
    -------
    dict
        Outcomes from the statistical test.
    """
    diff_mean = differences.mean()
    diff_svar = differences.std(ddof=1)
    dof = differences.shape[0] - 1  # n-1 d

    # By the central limit theorem, the `differences` values should be normally distributed
    # with average (central value) given by mean of group A values, minus mean of group B values.
    # Scale factor is the standard deviations of `differences`: estimated, therefore t-distribution
    ct = abs(t_value(p=(1 - conflevel) / 2.0, v=dof))

    with np.errstate(divide="ignore", invalid="ignore"):
        sd_z_variate = diff_svar * np.sqrt(np.divide(1.0, dof + 1))
        z_variate = np.divide(diff_mean, sd_z_variate)

    confint_lo = diff_mean - ct * sd_z_variate
    confint_hi = diff_mean + ct * sd_z_variate

    return {
        "Differences mean": diff_mean,
        "z value": z_variate,
        "ConfInt: Lo": confint_lo,
        "ConfInt: Hi": confint_hi,
        "p value": 2 * t_value_cdf(-np.abs(z_variate), dof),
        "Degrees of freedom": dof,
        "Standard deviation": sd_z_variate,
    }




[docs]
def ttest_paired_from_df(
    df: pd.DataFrame, grouper_column: str, values_column: str, conflevel: float = 0.995,
) -> pd.DataFrame:
    """
    Calculate the t-test for paired differences between two or more groups and returns a
    confidence interval for the difference. The test is for PAIRED differences.
    The differences is always defined as the A values minus the B values: after - before, or A - B.

    The dataframe `df` contains a `grouper_column` with 2 or more unique values (e.g. 'A' and 'B').
    All unique values of the `grouper_column` are used, and t-tests are done between the values
    in the `values_column`.

    When selecting the columns, the number of values per column must be the same.

    Args:
        df (pd.DataFrame): Dataframe of the values and grouping variable.
        grouper_column (str): Indicates which column will be grouped on.
        values_column (str): Which column contains the numeric values to calculate the test on.
        conflevel (float, optional): [description]. Defaults to 0.995.

    Output: Dataframe with columns containing the statistical outputs of the t-test, including:

    1. Group A name
    2. Group B name
    3. Group A mean
    4. Group B mean
    5. Differences mean: average difference between the groups (not the same as the difference
       of the averages from items 3 and 4 above)
    6. z-value for the difference between group "B" minus group "A"
    7. p-value for this z-value
    8. Confidence interval low value for difference between group "B" minus group "A"
    9. Confidence interval high value for difference between group "B" minus group "A"

    """
    data_subset = df[[grouper_column, values_column]].copy()
    data_subset = data_subset.dropna()
    groups = df[grouper_column].unique()
    output = pd.DataFrame()
    groups = list(groups)
    while len(groups) > 0:
        groupA_name = groups.pop(0)
        for groupB_name in groups:
            sample_A = data_subset[data_subset[grouper_column].eq(groupA_name)][values_column]
            sample_B = data_subset[data_subset[grouper_column].eq(groupB_name)][values_column]
            sample_A = sample_A.astype(np.float64)
            sample_B = sample_B.astype(np.float64)
            assert sample_A.shape[0] == sample_B.shape[0]
            differences = sample_A - sample_B.values  # only the .values of one vector are needed!
            basic_stats = ttest_paired(differences, conflevel)
            basic_stats.update(
                {
                    "Group A name": groupA_name,
                    "Group B name": groupB_name,
                    "Group A number": sample_A.shape[0],
                    "Group B number": sample_B.shape[0],
                    "Group A average": sample_A.mean(),
                    "Group B average": sample_B.mean(),
                }
            )
            output = pd.concat([output, pd.DataFrame(basic_stats, index=[0])])

    return output




[docs]
def confidence_interval(df: pd.DataFrame, column_name: str, conflevel: float = 0.95, style: str = "robust") -> tuple:
    """
    Calculate the confidence interval, returned as a tuple, for the `column_name` (str) in the
    dataframe `df`, for a given confidence level `conflevel` (default: 0.95).

    `style`: ['robust'; 'regular']: indicates which style of estimates to use for the center and
                                    spread. Default: 'robust'

    Missing values are ignored.
    """

    # TODO : http://www.rips-irsp.com/article/10.5334/irsp.82/

    data = df[column_name]
    n = data.count()

    if style.lower() == "robust":
        center = data.median()
        spread = median_absolute_deviation(data.values, nan_policy="omit")
    else:
        center = data.mean()
        spread = data.std()

    c_t = t_value(1 - (1 - conflevel) / 2, n - 1)
    return (center - c_t * spread / np.sqrt(n), center + c_t * spread / np.sqrt(n))



def _mad_1d(x: np.ndarray, center: Callable, nan_policy: str) -> float:
    """Taken from `scipy.stats.stats`.

    Median absolute deviation for 1-d array x.
    This is a helper function for `median_abs_deviation`; it assumes its
    arguments have been validated already.  In particular,  x must be a
    1-d numpy array, center must be callable, and if nan_policy is not
    'propagate', it is assumed to be 'omit', because 'raise' is handled
    in `median_abs_deviation`.
    No warning is generated if x is empty or all nan.
    """
    isnan = np.isnan(x)
    if isnan.any():
        if nan_policy == "propagate":
            return np.nan
        x = x[~isnan]
    if x.size == 0:
        # MAD of an empty array is nan.
        return np.nan
    # Edge cases have been handled, so do the basic MAD calculation.
    med = center(x)
    return np.median(np.abs(x - med))



[docs]
def median_absolute_deviation(
    x: np.ndarray,
    axis: int = 0,
    center: Callable = np.median,
    scale: str | float = "normal",
    nan_policy: str = "omit",
) -> np.ndarray | float:
    r"""
    Taken from `scipy.stats.stats`: we want the same functionality, but with a slightly different
    default function signature.

    *   `scale='normal'` instead of `scale=1.0`.
    *   `nan_policy='omit'` instead of `nan_policy='propogate'`

    Compute the median absolute deviation of the data along the given axis.  The median absolute
    deviation (MAD_) computes the median over the absolute deviations from the median. It is a
    measure of dispersion similar to the standard deviation but more `robust to outliers`_.
    The MAD of an empty array is ``np.nan``.

    Parameters
    ----------
    x : array_like
        Input array or object that can be converted to an array.
    axis : int or None, optional
        Axis along which the range is computed. Default is 0.
        Axis == None will not be accepted anymore.
    center : callable, optional
        A function that will return the central value. The default is to use
        np.median. Any user defined function used will need to have the
        function signature ``func(arr, axis)``.
    scale : scalar or str, optional
        The numerical value of scale will be divided out of the final
        result. The default is 1.0. The string "normal" is also accepted,
        and results in `scale` being the inverse of the standard normal
        quantile function at 0.75, which is approximately 0.67449.
        Array-like scale is also allowed, as long as it broadcasts correctly
        to the output such that ``out / scale`` is a valid operation. The
        output dimensions depend on the input array, `x`, and the `axis`
        argument.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.
        The following options are available (default is 'propagate'):
        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    Returns
    -------
    mad : scalar or ndarray
        If the input contains
        integers or floats of smaller precision than ``np.float64``, then the
        output data-type is ``np.float64``. Otherwise, the output data-type is
        the same as that of the input.

    See Also
    --------
    numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
    scipy.stats.tstd, scipy.stats.tvar

    Notes
    -----
    The `center` argument only affects the calculation of the central value
    around which the MAD is calculated. That is, passing in ``center=np.mean``
    will calculate the MAD around the mean - it will not calculate the *mean*
    absolute deviation.

    The input array may contain `inf`, but if `center` returns `inf`, the
    corresponding MAD for that data will be `nan`.

    References
    ----------

    .. _MAD:
        "Median absolute deviation",
        https://en.wikipedia.org/wiki/Median_absolute_deviation

    .. _robust to outliers:
        "Robust measures of scale",
        https://en.wikipedia.org/wiki/Robust_measures_of_scale

    Examples
    --------
    When comparing the behavior of `median_abs_deviation` with ``np.std``,
    the latter is affected when we change a single value of an array to have an
    outlier value while the MAD hardly changes:

    >>> from scipy import stats
    >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
    >>> x.std()
    0.9973906394005013
    >>> stats.median_abs_deviation(x)
    0.82832610097857
    >>> x[0] = 345.6
    >>> x.std()
    34.42304872314415
    >>> stats.median_abs_deviation(x)
    0.8323442311590675

    Axis handling example:

    >>> x = np.array([[10, 7, 4], [3, 2, 1]])
    >>> x
    array([[10,  7,  4],
           [ 3,  2,  1]])
    >>> stats.median_abs_deviation(x)
    array([3.5, 2.5, 1.5])
    >>> stats.median_abs_deviation(x, axis=None)  <-- syntax of `axis=None` will be depricated
    2.0

    Scale normal example:

    >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456)
    >>> stats.median_abs_deviation(x)
    1.3487398527041636
    >>> stats.median_abs_deviation(x, scale='normal')
    1.9996446978061115
    """
    assert axis is not None, "axis=None is now depricated. Unraval the array."
    if not callable(center):
        raise TypeError(f"The argument 'center' must be callable. The given value {center!r} is not callable.")

    # An error may be raised here, so fail-fast, before doing lengthy
    # computations, even though `scale` is not used until later
    if isinstance(scale, str):
        if scale.lower() == "normal":
            scale = 0.6744897501960817  # special.ndtri(0.75)
        else:
            raise ValueError(f"{scale} is not a valid scale value.")

    x = np.asarray(x)

    # Consistent with `np.var` and `np.std`.
    if not x.size:
        nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis)
        if nan_shape == ():
            # Return nan, not array(nan)
            return np.nan
        return np.full(nan_shape, np.nan)  # pragma: no cover

    contains_nan, nan_policy = _contains_nan(x, nan_policy)

    if contains_nan:
        mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy)
    else:
        # Wrap the call to center() in expand_dims() so it acts like
        # keepdims=True was used.
        med = np.expand_dims(center(x, axis=axis), axis)
        mad = np.median(np.abs(x - med), axis=axis)

    return mad / scale




[docs]
def summary_stats(x: np.ndarray | pd.Series, method: str = "robust") -> dict:
    """
    Return summary statistics of the numeric values in vector `x`.

    Arguments:
        x (Numpy vector or Pandas series): A vector of univariate values to summarize.

    Returns:
        dict: a summary of the univariate vector. The following outputs are the most interesting:

            "center":   a measure of the center  (average). If method is ``robust``, this is the
                        median.
            "spread":   a measure of the spread. If method is ``robust``, this is the Sn, a robust
                        spread estimate.
    """
    if isinstance(x, pd.Series):
        x = x.copy(deep=True).values
    elif isinstance(x, np.ndarray):
        x = x.ravel()
    else:
        raise TypeError("Expecting a NumPy vector or Pandas series.")

    out = {}
    out["mean"] = np.nanmean(x)
    out["std_ddof0"] = np.nanstd(x, ddof=0)
    out["std_ddof1"] = np.nanstd(x, ddof=1)
    out["rsd_classical"] = out["std_ddof1"] / out["mean"]
    (
        out["percentile_05"],
        out["percentile_25"],
        out["median"],
        out["percentile_75"],
        out["percentile_95"],
    ) = np.nanpercentile(x, [5, 25, 50, 75, 95])
    out["iqr"] = out["percentile_75"] - out["percentile_25"]
    out["min"] = np.nanmin(x)
    out["max"] = np.nanmax(x)
    out["N_non_missing"] = np.sum(~np.isnan(x))

    if method.lower() == "robust":
        out["center"], out["spread"] = out["median"], Sn(x)
        if ((out["max"] - out["min"]) > 0) and (out["spread"] == 0) and (out["N_non_missing"] > 0):
            # Don't fully trust the Sn() yet. It works strangely on quantized data when there is
            # little variation. Replace the RSD with the classically calculated version in this
            # very specific case. This example shows it: [99, 95, 95, 100, 100, 100, 100, 95, 100,
            # 100, 100, 100, 105, 105, 100, 95, 105, 100, 95, 100]
            out["center"], out["spread"] = out["mean"], out["std_ddof1"]

    else:
        out["center"], out["spread"] = out["mean"], out["std_ddof1"]

    out["rsd"] = out["spread"] / out["center"]
    return out




[docs]
def detect_outliers_esd(
    x: np.ndarray | pd.Series, algorithm: str = "esd", max_outliers_detected: int = 1, **kwargs
) -> tuple[list[int], defaultdict[Any, Any]]:
    """
    Return a list of indexes of points in the vector `x` which are likely outliers.

    A second output (can be ignored) contains the details of the values used to make the decision.

    Arguments:
        x {list, sequence, NumPy vector/array} -- [A sequence, list or vector which can be
        unravelled.]

    Keyword Arguments:
        algorithm -- Two algorithms are possible to detect outliers: (default: "esd")

            'esd': Generalized ESD Test for Outliers.
            If `max_outliers_detected=1` this is essentially Grubb's test.
            For more details, please see:
            https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm

            'cc-robust': Build a robust control-chart for the sequence `x` and
            points should lie outside the +/- 3 sigma limits are considered
            outliers.
            Not Implemented Yet: left here as an idea for the future, but not confirmed yet.

        max_outliers_detected -- The maximum number of outliers that
            should be detected, as required by the algorithms.

        kwargs -- Algorithm dependent arguments. Defaults are shown here.

            'esd':

                'robust_variant' = True. Uses the median and MAD for the center
                and the standard deviation respectively.

                'alpha' = 0.05. The significance level of the testing.


    """
    algorithm = algorithm.strip().lower()
    x = pd.Series(x).reset_index(drop=True)
    max_outliers_detected = int(max_outliers_detected)

    if algorithm == "esd":
        """
        https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm

        Note: the two-sided test is implemented here. p = 1-alpha/(2*(n-i+1))
        """

        # 0. Default settings
        robust_variant = bool(kwargs.get("robust_variant", True))
        alpha = float(kwargs.get("alpha", 0.05))

        assert alpha <= 1.0
        assert max_outliers_detected <= len(x)

        # 1. Run K-S test first to check normality

        # 2. https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
        x = x.copy(deep=True)
        extra_out = defaultdict(list)
        N = x.shape[0] - pd.isna(x).sum()

        for k in range(max_outliers_detected):
            i = k + 1
            extra_out["i"].append(i)

            if robust_variant:
                variation = median_absolute_deviation(x)
                R = ((x - x.median()) / variation).abs()
            else:
                variation = x.std()
                R = ((x - x.mean()) / variation).abs()

            dof = N - i - 1
            p = 1 - alpha / (2 * (N - i + 1))
            t_s = t_value(p, dof)
            lambda_i = (N - i) * t_s / np.sqrt((dof + np.power(t_s, 2)) * (dof + 2))
            extra_out["lambda"].append(lambda_i)

            g = R.max()
            s = g**2 * N * (2 - N) / (g**2 * N - (N - 1) ** 2)  # Formula from R-function for the Grubb's calculation
            p_value = 0 if s <= 0 else min(N * (1 - t_value_cdf(np.sqrt(s), N - 2)), 1)
            R_i_idx = -1 if pd.isna(p_value) else R.idxmax()
            extra_out["R_i_idx"].append(R_i_idx)
            extra_out["R_i"].append(R.max())
            extra_out["p-value"].append(p_value)

            if variation > __eps:
                # The variation, if zero or small, will fail to drop this index
                x = x.drop(R_i_idx)

        try:
            cutoff_i = np.where(
                np.array(extra_out["R_i"]) - np.array(extra_out["lambda"]) >= 0,
            )[0][0]
        except IndexError:
            cutoff_i = -1

        # The outlier indices are the points from the start of
        # `extra_out['R_i_idx']`, up to, and including, the `cutoff_i` index
        # in the list.
        extra_out["cutoff"] = cutoff_i

        outlier_index = extra_out["R_i_idx"][0 : (cutoff_i + 1)]
        return outlier_index, extra_out
    else:
        return [], defaultdict(dict)




[docs]
def variance_decomposition(df: pd.DataFrame, measured: str, repeat: str) -> dict:
    """
    Given a DataFrame `df` of raw data, and an indication of which column is the `measured` value
    column, and which is the `repeat` indicator, it will calculate the within and between replicate
    standard deviation.

    Example
    Two measurements on day 1 ``[101, 102]`` and two measurements on day 2 ``[94, 95]``. The
    between-day variation can already be expected to be much greater than the within-day variation.

    >>> df = pd.DataFrame(data={'Result': [101, 102, 94, 95], 'Repeat': [1, 1, 2, 2]})
        Result  Repeat
    0     101       1
    1     102       1
    2      94       2
    3      95       2
    >>> output = within_between_standard_deviation(df, measured="Result", repeat="Repeat")
    {'total_ms':       16.666667,
     'total_dof':      3,
     'within_ms':      0.5,
     'within_stddev':  0.70711,
     'within_dof':     2,
     'between_ms':     49.0,
     'between_stddev': 7.0,
     'between_dof':    1}

    Note
    * SSQ = sum of squares
    * DOF= degrees of freedom
    * MS = mean square = (sum of squares) / (degrees of freedom) = SSQ / DOF = variance
    """

    # Overall statistics:
    total_ms = max(0, df[measured].var())
    total_dof = max(0, df[measured].count() - 1)

    # Within a group: calculate the standard deviation (of variance) of each group. If you take
    # the average of those variances, pooled, you get an estimate of the within-group variance.
    within_ms = 0.0
    within_dof = 0
    for _, group in df.groupby(repeat):
        dof_group_i = max(0, group[measured].count() - 1)
        within_dof += dof_group_i
        # handling missing data makes for messier code
        within_ms = np.nansum([within_ms, group[measured].var() * dof_group_i])

    within_ms = 0 if within_dof == 0 else within_ms / within_dof

    # Between groups: the ANOVA relationship is such that SSQ(total) = SSQ(between) + SSQ(within).
    # Therefore the between-group statistics are found by differencing
    between_dof = max(0, total_dof - within_dof)
    between_ms = 0 if between_dof == 0 else max(0.0, (total_ms * total_dof - within_ms * within_dof) / between_dof)

    return {
        "total_ms": total_ms,
        "total_dof": total_dof,
        "within_ms": within_ms,
        "within_stddev": np.sqrt(within_ms),
        "within_dof": within_dof,
        "between_ms": between_ms,
        "between_stddev": np.sqrt(between_ms),
        "between_dof": between_dof,
    }



# ---------------------------------------------------------------------------
# Migration helpers - old names raise helpful errors
# ---------------------------------------------------------------------------

_RENAMED = {
    "median_abs_deviation": "median_absolute_deviation",
    "normality_check": "test_normality",
    "within_between_standard_deviation": "variance_decomposition",
    "outlier_detection_multiple": "detect_outliers_esd",
    "ttest_difference_calculate": "ttest_independent",
    "ttest_paired_difference_calculate": "ttest_paired",
    "ttest_difference": "ttest_independent_from_df",
    "ttest_paired_difference": "ttest_paired_from_df",
}


def __getattr__(name: str) -> NoReturn:
    if name in _RENAMED:
        new = _RENAMED[name]
        raise AttributeError(
            f"{name!r} has been renamed to {new!r}. "
            f"Use: from process_improve.univariate.metrics import {new}"
        )
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")