Source code for process_improve.bivariate.methods

"""
Bivariate statistical tools.

* elbow detection in an (x,y) plot
* peaks: finding peaks, quantifying their height, width, center, area, left & right boundaries
* area under curve
"""

import numpy as np

from ..regression.methods import fit_robust_lm



[docs]
def find_elbow_point(x: np.ndarray, y: np.ndarray, max_iter: int = 41) -> int | float:  # noqa: PLR0915
    """
    Find the elbow point when plotting numeric entries in `x` vs numeric values in list `y`.

    Return the index into the vectors `x` and `y` [the vectors must have the same length], where
    the elbow point occurs.

    Using a robust linear fit, sorts the samples in X (independent variable)
    and takes sample 1:5 from the left, and samples (end-5):end and fits two
    linear regressions, then computes the intersection of the two fitted lines.
    Adds a point to each regression, so (1:6) and (end-6:end) and repeats,
    accumulating one intersection point per iteration.

    The elbow is taken as the data point whose (x, y) location is closest to
    the median of the accumulated intersection points; the median location is
    where the intersections should stabilise.

    Will probably not work well on few data points. If so, try fitting a spline
    to the raw data and then repeat with the interpolated data.

    """
    start = 5
    # assert divmod(max_iter, 2)[1]  # must be odd number; to ensure we calculate the median later

    def calculate_line_length(x1: float, y1: float, x2: np.ndarray, y2: np.ndarray) -> float | np.ndarray:
        r"""Return the length of the line between 2 points (x1, y1) and (x2, y2).

        Defined as :math:`\\sqrt{(x2 - x1)^2 + (y2 - y1)^2}`.
        """
        return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

    # Ensure it is a Numpy array: pandas objects and lists are correctly handled.
    x = np.array(x.copy())
    y = np.array(y.copy())
    assert len(x) == len(y)

    # Stop if everything is missing:
    if np.isnan(np.nanmedian(x)) or np.isnan(np.nanmedian(y)):
        return -1

    # Eliminate missing values in x and y simultaneously.
    x, y = x[~(np.isnan(x) | np.isnan(y))], y[~(np.isnan(x) | np.isnan(y))]
    assert len(x) > 10, "Requires more than 10 values in the vectors (not including missing data)."
    idx_sort = x.argsort()
    x = x[idx_sort]
    y = y[idx_sort]
    N = x.size
    # Left and right anchor points: use median of the 5 points at start and end
    lft_x_avg = np.median(x[0:start])
    rgt_x_avg = np.median(x[-start:])

    int_x_list = []
    int_y_list = []
    lft_line_list = []  # type: List[float]
    rgt_line_list = []  # type: List[float]
    angle_list = []  # type: List[float]
    for i in np.floor(np.linspace(0, int(N / 2 - start) + 1, max_iter)):
        idx = int(start + i)
        lo_x, lo_y = x[0:idx], y[0:idx]
        hi_x, hi_y = x[-idx:], y[-idx:]
        lft = fit_robust_lm(lo_x, lo_y)
        rgt = fit_robust_lm(hi_x, hi_y)
        int_x, int_y = find_line_intersection(lft[1], lft[0], rgt[1], rgt[0])
        int_x_list.append(int_x)
        int_y_list.append(int_y)
        continue

        # The rest of this code is not reachable. Exploratory code: was used during development
        # only, and can be used to come up with alternative approaches of finding the elbow.
        if False:  # pragma: no cover
            # Left edge: (x=median(left 5 values), y = prediction from regression(x))
            lft_y = lft[0] + lft[1] * lft_x_avg

            # Right edge: (x=median(right 5 values), y = prediction from regression(x))
            rgt_y = rgt[0] + rgt[1] * rgt_x_avg

            lft_line = calculate_line_length(int_x, int_y, lft_x_avg, lft_y)
            rgt_line = calculate_line_length(int_x, int_y, rgt_x_avg, rgt_y)
            hypotenuse_line = calculate_line_length(lft_x_avg, lft_y, rgt_x_avg, rgt_y)
            lft_line_list.append(lft_line)
            rgt_line_list.append(rgt_line)

            angle = (
                np.arccos((lft_line**2 + rgt_line**2 - hypotenuse_line**2) / (2 * lft_line * rgt_line)) * 180.0 / np.pi
            )
            angle_list.append(angle)

    # Visualize the elbow point
    if False:
        import pandas as pd  # noqa: PLC0415

        data = pd.DataFrame(data={"x": x, "y": y})
        ax = data.plot.scatter(x="x", y="y")

        intersections = pd.DataFrame(data={"x_int": int_x_list, "y_int": int_y_list})
        intersections.plot.scatter(x="x_int", y="y_int", ax=ax)
        pd.DataFrame(intersections.median()).T.plot.scatter(x="x_int", y="y_int", color="red", ax=ax)
        ax.grid(True)

    # Elbow point is taken as the average intersection point which is closest
    # to the raw data. Handle the case for even and odd number of data points
    if divmod(len(int_x_list), 2)[1] == 0:
        mid_idx = np.argmin((np.array(int_x_list) - np.nanmedian(int_x_list)) ** 2)
    else:
        mid_idx = np.where(int_x_list == np.nanmedian(int_x_list))
        if mid_idx[0].any():
            mid_idx = mid_idx[0][0]
        else:
            return np.nan

    mid_x = np.nanmedian(int_x_list)
    if np.isnan(mid_x):
        return np.nan

    # TODO: Could robustify it:
    # np.quantile(calculate_line_length(mid_x, mid_y, xraw, yraw), 0.05)
    mid_y = int_y_list[int(mid_idx)]
    return int(np.argmin(calculate_line_length(mid_x, mid_y, x, y)))




[docs]
def find_line_intersection(m1: float, b1: float, m2: float, b2: float) -> tuple:
    """
    Find the intersection point of two lines.

    From Stackoverflow:
    stackoverflow.com/questions/20677795/how-do-i-compute-the-intersection-point-of-two-lines

    Returns a tuple: (x, y) where the two lines intersect, given slopes `m1` and
    `m2`, and intercepts `b1` and `b2`.
    """
    # These lines are essentially parallel!
    if np.abs(m1 - m2) < np.sqrt(np.finfo(float).eps):
        return np.nan, np.nan

    # y = mx + b
    # Derivation: Set both lines equal to find the intersection point in the x direction
    # m1 * x + b1 = m2 * x + b2
    # m1 * x - m2 * x = b2 - b1
    # x * (m1 - m2) = b2 - b1
    # Solving the above equation for x:
    x = (b2 - b1) / (m1 - m2)

    # Now solve it for y: use either line, because they are equal here:
    y = m1 * x + b1
    return x, y