Source code for process_improve.multivariate._preprocessing

# (c) Kevin Dunn, 2010-2026. MIT License. Based on own private work over the years.
"""Scaling and centering helpers for the multivariate package (ENG-01).

Holds :class:`MCUVScaler` (mean-center, unit-variance; the preferred scaler for
fitting PCA / PLS models) and the standalone :func:`center` / :func:`scale`
utilities. Depends only on :mod:`process_improve.multivariate._common`.
"""

from __future__ import annotations

from collections.abc import Callable

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted, validate_data

from ._common import DataMatrix


class MCUVScaler(TransformerMixin, BaseEstimator):
    """Mean-centre, unit-variance (MCUV) scaler.

    Unlike ``sklearn.preprocessing.StandardScaler`` this uses the sample
    standard deviation (``ddof=1``), the convention for chemometric data
    analysis where the population is the training set itself rather than a
    sampled super-population.

    The estimator follows the standard sklearn contract: ``n_features_in_``
    and ``feature_names_in_`` are populated by ``fit``; sparse / complex /
    object dtype / empty input are rejected with sklearn-style errors; NaN
    values pass through (the chemometric preprocessing pipeline expects to
    thread missing-data through to the downstream NIPALS estimator).

    Attributes
    ----------
    center_ : pd.Series
        Per-column mean (NaN-skipping) learned by :meth:`fit`, indexed by the
        fit-time feature names, or by a ``RangeIndex`` when no names were
        captured.
    scale_ : pd.Series
        Per-column sample standard deviation (``ddof=1``, NaN-skipping)
        learned by :meth:`fit`, with zero entries replaced by 1.0. Same index
        as ``center_``.
    """

    def __init__(self):
        pass

    def __sklearn_tags__(self):
        """Declare sklearn capability tags (sklearn 1.6+).

        ``allow_nan=True`` because :meth:`fit` and :meth:`transform` use
        ``np.nanmean`` / ``np.nanstd``: NaN cells flow through, get re-NaN'd
        by the centring/scaling arithmetic, and reach the downstream NIPALS
        estimator that knows how to handle them.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags

    def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ANN001
        """Return the output column names of :meth:`transform`.

        :class:`MCUVScaler` is column-preserving (centring + scaling leave
        the X column layout unchanged), so the returned names mirror those
        captured during :meth:`fit` (or the ``input_features`` argument when
        no ``feature_names_in_`` was captured - the standard sklearn fallback
        for ndarray-fit estimators).

        Used by :meth:`set_output` (sklearn 1.2+) to label the
        :class:`~pandas.DataFrame` view of the output when
        ``set_output(transform="pandas")`` is on, and by Pipeline
        introspection.
        """
        return _check_feature_names_in(self, input_features)

    def fit(self, X: DataMatrix, y=None) -> MCUVScaler:  # noqa: ANN001, ARG002
        """Compute the column means and sample standard deviations.

        ``y`` is accepted (and ignored) so the scaler plugs into
        :class:`sklearn.pipeline.Pipeline`, which threads ``y`` through every
        step's ``fit`` even when (as for a transformer) it is unused.

        Returns
        -------
        MCUVScaler
            ``self``, with ``center_`` and ``scale_`` set.
        """
        # Convenience: accept a 1-D Series (a single-column y, common when
        # the scaler is used for the target side of a PLS fit). validate_data
        # itself requires 2-D input, so promote here before it sees X.
        if isinstance(X, pd.Series):
            X = X.to_frame()

        X_arr = validate_data(
            self,
            X,
            reset=True,
            accept_sparse=False,
            ensure_min_samples=2,
            ensure_min_features=1,
            dtype="numeric",
            ensure_all_finite="allow-nan",
        )

        # validate_data(reset=True) records feature_names_in_ only when X
        # carried column names; otherwise label the statistics positionally.
        feature_names = getattr(self, "feature_names_in_", None)
        index = pd.Index(feature_names) if feature_names is not None else pd.RangeIndex(X_arr.shape[1])

        # nanmean / nanstd so NaN cells pass through with the right
        # column-level statistics (the chemometric pipeline's missing-data
        # contract). std uses ddof=1: this is the difference from
        # sklearn.preprocessing.StandardScaler.
        center = np.nanmean(X_arr, axis=0)
        scale = np.nanstd(X_arr, axis=0, ddof=1)

        # Constant columns are left as-is (scale to 1.0) rather than
        # producing inf / nan when transform divides.
        scale = np.where(scale == 0, 1.0, scale)

        self.center_ = pd.Series(center, index=index)
        self.scale_ = pd.Series(scale, index=index)
        return self

    def transform(self, X: DataMatrix, y=None) -> pd.DataFrame:  # noqa: ANN001, ARG002
        """Mean-centre and unit-variance scale ``X``.

        ``y`` is accepted (and ignored) for :class:`Pipeline` compatibility.

        Returns
        -------
        pd.DataFrame
            The scaled data, with the fit-time column labels and the row
            index of ``X`` when ``X`` is a pandas object.
        """
        check_is_fitted(self, ("center_", "scale_"))

        # Mirror fit()'s Series convenience for symmetric round-tripping.
        if isinstance(X, pd.Series):
            X = X.to_frame()

        # Preserve the row index for DataFrame input; ndarray input falls
        # back to a RangeIndex.
        index = X.index if isinstance(X, pd.DataFrame) else None

        X_arr = validate_data(
            self,
            X,
            reset=False,
            accept_sparse=False,
            dtype="numeric",
            ensure_all_finite="allow-nan",
        )
        out = (X_arr - self.center_.to_numpy()) / self.scale_.to_numpy()
        return pd.DataFrame(out, index=index, columns=self.center_.index)

    def inverse_transform(self, X: DataMatrix) -> pd.DataFrame:
        """Inverse the mean-centring and unit-variance scaling.

        Returns
        -------
        pd.DataFrame
            ``X * scale_ + center_`` with the fit-time column labels. The row
            index of ``X`` is kept for both DataFrame and Series input, so a
            Series survives a ``transform`` / ``inverse_transform`` round
            trip with its labels intact.
        """
        check_is_fitted(self, ("center_", "scale_"))

        # Keep row labels for any pandas input. transform() promotes a Series
        # to a frame and keeps its index; do the equivalent here so the round
        # trip is symmetric.
        index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None

        # inverse_transform is intentionally NOT routed through validate_data:
        # callers (TransformedTargetRegressor included) pass ndarray output
        # from a downstream estimator that may have a different shape than
        # the fit-time X (typical: 1-D y_pred for a single-target regressor).
        # We coerce to 2-D, scale back, and return a DataFrame.
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim == 1:
            X_arr = X_arr.reshape(-1, 1)

        out = X_arr * self.scale_.to_numpy() + self.center_.to_numpy()
        return pd.DataFrame(out, index=index, columns=self.center_.index)
def center(
    X,  # noqa: ANN001
    func: Callable = np.mean,
    axis: int = 0,
    extra_output: bool = False,
) -> DataMatrix | tuple[DataMatrix, np.ndarray]:
    """
    Perform centering of data, using a function, `func` (default: np.mean).

    `func` [optional; default=np.mean] {a function}
        Applied to every slice of X along `axis`; it must return one value
        per slice, so that the result is a vector with as many entries as X
        has columns (``axis=0``) or rows (``axis=1``).

    `axis` [optional; default=0] {integer}
        The axis along which the centering vector is calculated:
        0 = down the columns; 1 = across the rows.

    `extra_output` [optional; default=False] {bool}
        When True, the centering vector is returned together with the
        centered data.

    *Missing values*: The sample mean is computed by taking the sum along the
    `axis`, skipping any missing data, and dividing by N = number of values
    which are present. Values which were missing before, are left as missing
    after.

    Returns
    -------
    centered : DataMatrix
        ``X`` minus the centering vector, returned when
        ``extra_output=False`` (the default).
    (centered, vector) : tuple[DataMatrix, np.ndarray]
        When ``extra_output=True``, the centered data and the 1-D centering
        vector.
    """
    # pandas-stubs types apply()'s axis as a Literal, so a plain ``int`` axis does
    # not match any overload; the call is valid at runtime.
    vector = pd.DataFrame(X).apply(func, axis=axis).to_numpy()  # type: ignore[call-overload]  # pandas-stubs axis is Literal

    # A per-row vector has shape (n_rows,). Subtracting it directly would
    # broadcast along the columns, which fails for non-square X and is silently
    # wrong for square X. Turn it into a column so each row gets its own
    # offset. 1-D input is left alone: it already lines up element-wise.
    per_row = axis in (1, "columns") and np.ndim(X) > 1
    offset = vector.reshape(-1, 1) if per_row else vector

    centered = np.subtract(X, offset)
    if extra_output:
        return centered, vector
    return centered
def scale(
    X: DataMatrix,
    func: Callable = np.std,
    axis: int = 0,
    extra_output: bool = False,
    ddof: int = 0,
    **kwargs,
) -> DataMatrix | tuple[DataMatrix, np.ndarray]:
    """
    Scales the data (does NOT do any centering); scales to unit variance by
    default.

    `func` [optional; default=np.std] {a function}
        The default (np.std) uses NumPy to calculate the standard deviation of
        the data along the required `axis`, skipping over any missing data,
        and uses that as `scale`.

    `axis` [optional; default=0] {integer}
        Transformations are applied on slices of data. This specifies the
        axis along which the transformation will be applied:
        0 = down the columns; 1 = across the rows.

    `extra_output` [optional; default=False] {bool}
        When True, the scaling vector is returned together with the scaled
        data.

    `ddof` [optional; default=0] {integer}
        Delta degrees of freedom, forwarded to `np.std` when `func` is the
        default `np.std`. The standard deviation is computed by dividing by
        ``N - ddof``, where N is the number of values which are present. The
        default (``ddof=0``) divides by N (the population standard deviation);
        pass ``ddof=1`` for the sample standard deviation (dividing by N-1).

        Note: :class:`MCUVScaler` uses ``ddof=1`` and is the preferred scaler
        for fitting PCA / PLS models. Use ``scale(center(X), ddof=1)`` here to
        match it. The ``ddof`` argument is ignored when a custom `func` is
        supplied (forward your own keyword arguments via ``**kwargs``
        instead).

    `**kwargs`
        Extra keyword arguments forwarded to `func`.

    Constant (zero-variance) columns are left unchanged: a zero entry in the
    computed scaling vector is replaced by 1.0 before inversion, mirroring
    :class:`MCUVScaler`, so no ``inf`` / ``NaN`` is introduced.

    Usage
    =====

    X = ...  # data matrix
    X = scale(center(X))
    X = scale(center(X), ddof=1)  # sample standard deviation, matches MCUVScaler

    def my_scale(x):  # e.g. median absolute deviation
        return np.nanmedian(np.abs(x - np.nanmedian(x)))

    X = scale(center(X), func=my_scale)

    Returns
    -------
    scaled : DataMatrix
        The scaled data, returned when ``extra_output=False`` (the default).
    (scaled, scale_vector) : tuple[DataMatrix, np.ndarray]
        When ``extra_output=True``, a tuple of the scaled data and the 1-D
        scaling vector (the reciprocal of `func` applied along `axis`, with
        zero entries replaced by 1.0 to leave constant slices unchanged) is
        returned instead.
    """
    # ``ddof`` is a named parameter, so it can never arrive through **kwargs;
    # forward it only to the default np.std, never to a custom ``func``.
    if func is np.std:
        kwargs["ddof"] = ddof

    # pandas-stubs types apply()'s axis as a Literal, so a plain ``int`` axis does
    # not match any overload; the call is valid at runtime.
    vector = pd.DataFrame(X).apply(func, axis=axis, **kwargs).to_numpy()  # type: ignore[call-overload]  # pandas-stubs axis is Literal

    # Zero-variance (constant) columns are left as-is, mirroring MCUVScaler, so
    # that ``1.0 / vector`` does not introduce inf/NaN.
    vector = np.where(vector == 0, 1.0, vector)
    vector = 1.0 / vector

    # A per-row vector has shape (n_rows,). Multiplying by it directly would
    # broadcast along the columns, which fails for non-square X and is silently
    # wrong for square X. Turn it into a column so each row gets its own
    # factor. 1-D input is left alone: it already lines up element-wise.
    per_row = axis in (1, "columns") and np.ndim(X) > 1
    factor = vector.reshape(-1, 1) if per_row else vector

    scaled = np.multiply(X, factor)
    if extra_output:
        return scaled, vector
    return scaled