Source code for process_improve.bivariate.methods

"""
Bivariate statistical tools.

* elbow detection in an (x,y) plot
* peaks: finding peaks, quantifying their height, width, center, area, left & right boundaries
* area under curve
"""

import numpy as np

from ..regression.methods import fit_robust_lm


[docs] def find_elbow_point(x: np.ndarray, y: np.ndarray, max_iter: int = 41) -> int | float: # noqa: PLR0915 """ Find the elbow point when plotting numeric entries in `x` vs numeric values in list `y`. Return the index into the vectors `x` and `y` [the vectors must have the same length], where the elbow point occurs. Using a robust linear fit, sorts the samples in X (independent variable) and takes sample 1:5 from the left, and samples (end-5):end and fits two linear regressions, then computes the intersection of the two fitted lines. Adds a point to each regression, so (1:6) and (end-6:end) and repeats, accumulating one intersection point per iteration. The elbow is taken as the data point whose (x, y) location is closest to the median of the accumulated intersection points; the median location is where the intersections should stabilise. Will probably not work well on few data points. If so, try fitting a spline to the raw data and then repeat with the interpolated data. """ start = 5 # assert divmod(max_iter, 2)[1] # must be odd number; to ensure we calculate the median later def calculate_line_length(x1: float, y1: float, x2: np.ndarray, y2: np.ndarray) -> float | np.ndarray: r"""Return the length of the line between 2 points (x1, y1) and (x2, y2). Defined as :math:`\\sqrt{(x2 - x1)^2 + (y2 - y1)^2}`. """ return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) # Ensure it is a Numpy array: pandas objects and lists are correctly handled. x = np.array(x.copy()) y = np.array(y.copy()) assert len(x) == len(y) # Stop if everything is missing: if np.isnan(np.nanmedian(x)) or np.isnan(np.nanmedian(y)): return -1 # Eliminate missing values in x and y simultaneously. x, y = x[~(np.isnan(x) | np.isnan(y))], y[~(np.isnan(x) | np.isnan(y))] assert len(x) > 10, "Requires more than 10 values in the vectors (not including missing data)." idx_sort = x.argsort() x = x[idx_sort] y = y[idx_sort] N = x.size # Left and right anchor points: use median of the 5 points at start and end lft_x_avg = np.median(x[0:start]) rgt_x_avg = np.median(x[-start:]) int_x_list = [] int_y_list = [] lft_line_list = [] # type: List[float] rgt_line_list = [] # type: List[float] angle_list = [] # type: List[float] for i in np.floor(np.linspace(0, int(N / 2 - start) + 1, max_iter)): idx = int(start + i) lo_x, lo_y = x[0:idx], y[0:idx] hi_x, hi_y = x[-idx:], y[-idx:] lft = fit_robust_lm(lo_x, lo_y) rgt = fit_robust_lm(hi_x, hi_y) int_x, int_y = find_line_intersection(lft[1], lft[0], rgt[1], rgt[0]) int_x_list.append(int_x) int_y_list.append(int_y) continue # The rest of this code is not reachable. Exploratory code: was used during development # only, and can be used to come up with alternative approaches of finding the elbow. if False: # pragma: no cover # Left edge: (x=median(left 5 values), y = prediction from regression(x)) lft_y = lft[0] + lft[1] * lft_x_avg # Right edge: (x=median(right 5 values), y = prediction from regression(x)) rgt_y = rgt[0] + rgt[1] * rgt_x_avg lft_line = calculate_line_length(int_x, int_y, lft_x_avg, lft_y) rgt_line = calculate_line_length(int_x, int_y, rgt_x_avg, rgt_y) hypotenuse_line = calculate_line_length(lft_x_avg, lft_y, rgt_x_avg, rgt_y) lft_line_list.append(lft_line) rgt_line_list.append(rgt_line) angle = ( np.arccos((lft_line**2 + rgt_line**2 - hypotenuse_line**2) / (2 * lft_line * rgt_line)) * 180.0 / np.pi ) angle_list.append(angle) # Visualize the elbow point if False: import pandas as pd # noqa: PLC0415 data = pd.DataFrame(data={"x": x, "y": y}) ax = data.plot.scatter(x="x", y="y") intersections = pd.DataFrame(data={"x_int": int_x_list, "y_int": int_y_list}) intersections.plot.scatter(x="x_int", y="y_int", ax=ax) pd.DataFrame(intersections.median()).T.plot.scatter(x="x_int", y="y_int", color="red", ax=ax) ax.grid(True) # Elbow point is taken as the average intersection point which is closest # to the raw data. Handle the case for even and odd number of data points if divmod(len(int_x_list), 2)[1] == 0: mid_idx = np.argmin((np.array(int_x_list) - np.nanmedian(int_x_list)) ** 2) else: mid_idx = np.where(int_x_list == np.nanmedian(int_x_list)) if mid_idx[0].any(): mid_idx = mid_idx[0][0] else: return np.nan mid_x = np.nanmedian(int_x_list) if np.isnan(mid_x): return np.nan # TODO: Could robustify it: # np.quantile(calculate_line_length(mid_x, mid_y, xraw, yraw), 0.05) mid_y = int_y_list[int(mid_idx)] return int(np.argmin(calculate_line_length(mid_x, mid_y, x, y)))
[docs] def find_line_intersection(m1: float, b1: float, m2: float, b2: float) -> tuple: """ Find the intersection point of two lines. From Stackoverflow: stackoverflow.com/questions/20677795/how-do-i-compute-the-intersection-point-of-two-lines Returns a tuple: (x, y) where the two lines intersect, given slopes `m1` and `m2`, and intercepts `b1` and `b2`. """ # These lines are essentially parallel! if np.abs(m1 - m2) < np.sqrt(np.finfo(float).eps): return np.nan, np.nan # y = mx + b # Derivation: Set both lines equal to find the intersection point in the x direction # m1 * x + b1 = m2 * x + b2 # m1 * x - m2 * x = b2 - b1 # x * (m1 - m2) = b2 - b1 # Solving the above equation for x: x = (b2 - b1) / (m1 - m2) # Now solve it for y: use either line, because they are equal here: y = m1 * x + b1 return x, y