Source code for process_improve.monitoring.metrics

import numpy as np
import pandas as pd
from sklearn.utils import Bunch

from ..univariate.metrics import Sn



[docs]
def calculate_cpk(  # noqa: C901
    df: pd.DataFrame,
    which_column: str,
    specifications: tuple[float, float] = (np.nan, np.nan),
    trim_percentile: float = 2.5,
) -> Bunch:
    """
    Calculate the process capability, Cpk, for one column of ``df``.

    The capability is computed against both the lower and the upper specification limit,
    and the limiting (smaller) of the two is reported [determined automatically].

    Process capability, nearer the lower limit = (avg - lower_spec)/(3 x std deviation)
    Process capability, nearer the upper limit = (upper_spec - avg)/(3 x std deviation)

    Parameters
    ----------
    df : pd.DataFrame
        Raw data, at least one column is numeric.
    which_column : str
        Indicates which is the column of data that should be used for the Cpk calculation.
    specifications : tuple of (lower, upper), optional
        A 2-tuple ``(lower_spec, upper_spec)`` of the lower and upper specification limits.
        Each element may be:

        * a numeric value, when the specification is constant over time;
        * a string, interpreted as a column name in ``df`` whose values give the
          per-row specification (use this when the specification changes over time);
        * ``None``, in which case the corresponding spec is estimated from the data
          using ``trim_percentile`` (a percentile-based robust limit).

        Default is ``(np.nan, np.nan)``, which treats both specs as numeric NaN
        and yields NaN for the corresponding side of the Cpk calculation.
    trim_percentile : float, optional
        Controls two things. (1) When a specification limit is missing (``None``),
        ``trim_percentile`` is used as a percentile on the data (in percent) to estimate that
        limit: the lower spec is set to ``np.nanpercentile(data, trim_percentile)`` and the
        upper spec to ``np.nanpercentile(data, 100 - trim_percentile)``. Default ``2.5``
        therefore yields the 2.5th and 97.5th percentiles. (2) When ``trim_percentile > 0``
        the centre/spread used in the Cpk formula switch from mean/std to robust alternatives
        (median and ``Sn``); when 0 the classical mean/std are used.

    Returns
    -------
    sklearn.utils.Bunch
        A bunch with the following fields:

        * ``cpk``: the Cpk value (the limiting, i.e. smaller, of the two sides). A side
          whose Cpk is NaN never wins over a side with a finite Cpk; when both sides are
          NaN the lower side is reported.
        * ``center``: the center (mean or median) of the limiting side.
        * ``spread``: the spread (standard deviation or Sn) of the limiting side.
        * ``rsd``: the relative standard deviation of the limiting side, as a
          percentage, ``(spread / center) * 100``; NaN when ``center`` is exactly zero.

    Raises
    ------
    ValueError
        If ``trim_percentile`` is negative, or is 40 or larger.

    Warns
    -----
    RuntimeWarning
        Emitted once per side whose spread is not strictly positive (zero, negative or
        NaN). The Cpk of that side is returned as NaN instead of inf/NaN from a bare
        division. The warning is attributed to the caller of ``calculate_cpk``.

    Notes
    -----
    ``center`` and ``spread`` are computed on the *distance-to-specification* series,
    i.e. ``data - lower_spec`` for the lower side and ``upper_spec - data`` for the upper
    side, not on the raw column. ``center`` is therefore the typical margin to the limit.
    """
    import warnings  # noqa: PLC0415

    if trim_percentile < 0:
        raise ValueError(
            f"trim_percentile must be non-negative; got {trim_percentile}."
        )
    if trim_percentile >= 40:
        raise ValueError(
            f"trim_percentile must be < 40 (typically <= 10-20); got {trim_percentile}."
        )
    lower_spec, upper_spec = specifications

    def _resolve_spec(spec: object, percentile: float) -> "float | pd.Series":
        """Turn one user-supplied spec into a scalar limit or a per-row Series of limits."""
        if spec is None:
            # No spec given: estimate a robust limit from the data itself.
            return float(np.nanpercentile(df[which_column].values, percentile))
        if isinstance(spec, str):
            # Time-varying spec: the string names the column holding the per-row limits.
            return df[spec]
        return float(spec)

    Cpk_lower_spec = _resolve_spec(lower_spec, trim_percentile)
    Cpk_upper_spec = _resolve_spec(upper_spec, 100 - trim_percentile)

    # Distance of every observation from each limit; positive means "inside the spec".
    metric_lower = df[which_column] - Cpk_lower_spec
    metric_upper = Cpk_upper_spec - df[which_column]

    if trim_percentile > 0:
        # Robust estimates of location and scale.
        center_lower, center_upper = float(metric_lower.median()), float(metric_upper.median())
        spread_lower, spread_upper = float(Sn(metric_lower)), float(Sn(metric_upper))
    else:
        # Classical estimates of location and scale.
        center_lower, center_upper = float(metric_lower.mean()), float(metric_upper.mean())
        spread_lower, spread_upper = float(metric_lower.std()), float(metric_upper.std())

    # A column with no spread (constant data, or only one non-NaN value)
    # makes Cpk undefined: a bare division would silently yield inf / NaN.
    # Emit a clear warning and return NaN per side -- callers can then
    # distinguish "Cpk could not be computed" from a numeric result. SEC-24
    # (#273).
    def _safe_ratio(numer: float, denom: float, side: str) -> float:
        """Return ``numer / (3 * denom)``, or NaN (with a warning) for an unusable spread."""
        # ``not (denom > 0)`` rather than ``denom <= 0`` so that a NaN spread is caught too.
        if not (denom > 0):
            warnings.warn(
                f"Cpk_{side}: spread is zero or non-finite; returning NaN. "
                "Likely cause: constant column or only one non-NaN value.",
                category=RuntimeWarning,
                # Frames: 1 = this helper, 2 = calculate_cpk, 3 = the user's call site.
                stacklevel=3,
            )
            return float("nan")
        return numer / (3 * denom)

    cpk_lower = _safe_ratio(center_lower, spread_lower, "lower")
    cpk_upper = _safe_ratio(center_upper, spread_upper, "upper")

    # The Cpk is the smaller (limiting) of the two sides; report the centre,
    # spread and RSD of whichever side that is. A NaN side never wins over a
    # finite one.
    if np.isnan(cpk_upper) or (not np.isnan(cpk_lower) and cpk_lower <= cpk_upper):
        cpk, center, spread = cpk_lower, center_lower, spread_lower
    else:
        cpk, center, spread = cpk_upper, center_upper, spread_upper

    # Guard only against an exact-zero centre; a NaN centre propagates to a NaN rsd.
    rsd = (spread / center) * 100 if center else float("nan")
    return Bunch(cpk=cpk, center=center, spread=spread, rsd=rsd)



# Maps each retired public name of this module to the name that replaced it.
_RENAMED = {"calculate_Cpk": "calculate_cpk"}


def __getattr__(name: str) -> None:
    """
    Module-level attribute hook; always raises ``AttributeError``.

    Python calls this only when ``name`` is not found among the module's regular
    attributes. If ``name`` is listed in ``_RENAMED`` the error points the caller to the
    replacement name; otherwise the usual "no attribute" error is raised.
    """
    replacement = _RENAMED.get(name)
    if replacement is None:
        # Not a known old name: behave like a normal missing attribute.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    raise AttributeError(
        f"{name!r} has been renamed to {replacement!r}. "
        f"Use: from process_improve.monitoring.metrics import {replacement}"
    )