# (c) Kevin Dunn, 2010-2026. MIT License. Based on own private work over the years.
"""Scaling and centering helpers for the multivariate package (ENG-01).
Holds :class:`MCUVScaler` (mean-center, unit-variance; the preferred scaler for
fitting PCA / PLS models) and the standalone :func:`center` / :func:`scale`
utilities. Depends only on :mod:`process_improve.multivariate._common`.
"""
from __future__ import annotations
from collections.abc import Callable
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted, validate_data
from ._common import DataMatrix
[docs]
class MCUVScaler(TransformerMixin, BaseEstimator):
    """Mean-centre, unit-variance (MCUV) scaler.

    The scaling statistic is the *sample* standard deviation (``ddof=1``),
    which is what distinguishes this scaler from
    ``sklearn.preprocessing.StandardScaler``; it is the usual convention in
    chemometric data analysis.

    The estimator honours the sklearn fitting contract: ``fit`` validates its
    input through ``validate_data`` (which populates ``n_features_in_`` and,
    for named input, ``feature_names_in_``), refuses sparse input, requires
    numeric data with at least two samples and one feature, and lets NaN
    values through so that missing data can be handled further down the
    preprocessing pipeline.

    Attributes set by :meth:`fit`
    -----------------------------
    center_ : pandas.Series
        Per-column mean, computed while ignoring NaN entries.
    scale_ : pandas.Series
        Per-column sample standard deviation (``ddof=1``), computed while
        ignoring NaN entries; exact zeros are replaced by ``1.0``.
    """

    def __init__(self):
        """Create the scaler; there are no hyper-parameters to configure."""

    def __sklearn_tags__(self):
        """Declare sklearn capability tags (sklearn 1.6+).

        The inherited tags are returned with ``input_tags.allow_nan`` switched
        on, because :meth:`fit` computes its statistics with ``np.nanmean`` /
        ``np.nanstd`` and therefore tolerates missing cells.
        """
        capability_tags = super().__sklearn_tags__()
        capability_tags.input_tags.allow_nan = True
        return capability_tags

    def get_feature_names_out(self, input_features=None) -> np.ndarray:  # noqa: ANN001
        """Return the names of the columns produced by the scaler.

        Centring and scaling do not add, drop or reorder columns, so the
        output names are simply the input names. The lookup (including the
        fallback to ``input_features`` when no names were captured during
        :meth:`fit`) is delegated to sklearn's ``_check_feature_names_in``.
        """
        names_out = _check_feature_names_in(self, input_features)
        return names_out

    def fit(self, X: DataMatrix, y=None) -> MCUVScaler:  # noqa: ANN001, ARG002
        """Learn the per-column mean and sample standard deviation of ``X``.

        Parameters
        ----------
        X : DataMatrix
            Training data. A 1-D :class:`pandas.Series` is accepted as a
            convenience and treated as a single-column frame.
        y : ignored
            Present only so the scaler can be used as a step in a
            :class:`sklearn.pipeline.Pipeline`, which passes ``y`` to every
            step's ``fit``.

        Returns
        -------
        MCUVScaler
            ``self``, with ``center_`` and ``scale_`` set.
        """
        # validate_data insists on 2-D input, so a Series (e.g. a single
        # target column) is promoted to a one-column frame first.
        candidate = X.to_frame() if isinstance(X, pd.Series) else X
        values = validate_data(
            self,
            candidate,
            reset=True,
            accept_sparse=False,
            ensure_min_samples=2,
            ensure_min_features=1,
            dtype="numeric",
            ensure_all_finite="allow-nan",
        )

        # Label the statistics with the captured column names when there are
        # any; otherwise fall back to positional labels.
        captured_names = getattr(self, "feature_names_in_", None)
        if captured_names is None:
            labels = pd.RangeIndex(values.shape[1])
        else:
            labels = pd.Index(captured_names)

        # NaN-aware statistics so that missing cells do not poison a whole
        # column. ddof=1 gives the sample standard deviation.
        column_means = np.nanmean(values, axis=0)
        column_stds = np.nanstd(values, axis=0, ddof=1)

        # A constant column has zero spread; use 1.0 instead so a later
        # division leaves it unchanged rather than producing inf / nan.
        safe_stds = np.where(column_stds == 0, 1.0, column_stds)

        self.center_ = pd.Series(column_means, index=labels)
        self.scale_ = pd.Series(safe_stds, index=labels)
        return self
[docs]
def center(
    X,  # noqa: ANN001
    func: Callable = np.mean,
    axis: int = 0,
    extra_output: bool = False,
) -> DataMatrix | tuple[DataMatrix, np.ndarray]:
    """
    Center the data by subtracting a location statistic computed with `func`.

    Parameters
    ----------
    X : array-like
        The data to center. It is wrapped in a :class:`pandas.DataFrame` to
        compute the statistic, but the subtraction is applied to ``X`` itself.
    func : callable, optional (default: ``np.mean``)
        Applied to every slice of ``X`` taken along `axis`; it must return one
        value per slice.
    axis : int, optional (default: 0)
        0 computes one value per column (the function runs down the columns);
        1 computes one value per row (the function runs across the rows).
    extra_output : bool, optional (default: False)
        When True, the centering vector is returned as well.

    Returns
    -------
    centered : DataMatrix
        ``X`` minus the centering vector, broadcast so that each column
        (``axis=0``) or each row (``axis=1``) has its own value removed.
    (centered, vector) : tuple
        Returned instead when ``extra_output=True``. ``vector`` is the 1-D
        array of values produced by `func`, one per column or per row.

    Notes
    -----
    *Missing values*: with the default `func` the mean is computed over the
    values which are present (sum of the present values divided by their
    count). Values which were missing before are left as missing after.
    A custom `func` is responsible for its own missing-value handling.
    """
    # pandas-stubs types apply()'s axis as a Literal, so a plain ``int`` axis does
    # not match any overload; the call is valid at runtime.
    vector = pd.DataFrame(X).apply(func, axis=axis).to_numpy()  # type: ignore[call-overload]  # pandas-stubs axis is Literal

    # A per-row vector (axis=1) has one entry per row. NumPy/pandas would
    # broadcast a 1-D vector along the columns, which is wrong (and fails for
    # non-square input), so present it as a column for the subtraction.
    per_row = axis in (1, "columns") and vector.ndim == 1 and np.ndim(X) == 2
    offset = vector[:, np.newaxis] if per_row else vector

    centered = np.subtract(X, offset)
    if extra_output:
        return centered, vector
    return centered
[docs]
def scale(
    X: DataMatrix,
    func: Callable = np.std,
    axis: int = 0,
    extra_output: bool = False,
    ddof: int = 0,
    **kwargs,
) -> DataMatrix | tuple[DataMatrix, np.ndarray]:
    """
    Scales the data (does NOT do any centering); scales to unit variance by
    default.

    `func` [optional; default=np.std] {a function}
        Applied to every slice of the data taken along `axis`; it must return
        one value per slice, which is used as that slice's `scale`. The
        default (np.std) calculates the standard deviation.

    `axis` [optional; default=0] {integer}
        Transformations are applied on slices of data. This specifies the
        axis along which the transformation will be applied: 0 gives one
        scaling value per column, 1 gives one scaling value per row.

    `extra_output` [optional; default=False] {boolean}
        When True, the scaling vector is returned together with the scaled
        data.

    `ddof` [optional; default=0] {integer}
        Delta degrees of freedom, forwarded to `np.std` when `func` is the
        default `np.std`. The standard deviation is computed by dividing by
        ``N - ddof``, where N is the number of values which are present. The
        default (``ddof=0``) divides by N (the population standard deviation);
        pass ``ddof=1`` for the sample standard deviation (dividing by N-1).

    `**kwargs`
        Extra keyword arguments forwarded to `func`. A ``ddof`` given here
        takes precedence over the `ddof` parameter.

    Note: :class:`MCUVScaler` uses ``ddof=1`` and is the preferred scaler
    for fitting PCA / PLS models. Use ``scale(center(X), ddof=1)`` here to
    match it. The ``ddof`` argument is ignored when a custom `func` is
    supplied (forward your own keyword arguments via ``**kwargs`` instead).

    Constant (zero-variance) slices are left unchanged: a zero entry in the
    computed scaling vector is replaced by 1.0 before inversion, mirroring
    :class:`MCUVScaler`, so no ``inf`` / ``NaN`` is introduced by the
    division.

    Usage
    =====
    X = ...  # data matrix
    X = scale(center(X))
    X = scale(center(X), ddof=1)  # sample standard deviation, matches MCUVScaler

    def my_scale(values):  # any callable returning one number per slice
        return values.max() - values.min()

    X = scale(center(X), func=my_scale)

    Returns
    -------
    scaled : DataMatrix
        The scaled data, returned when ``extra_output=False`` (the default).
    (scaled, scale_vector) : tuple[DataMatrix, np.ndarray]
        When ``extra_output=True``, a tuple of the scaled data and the 1-D
        scaling vector (the reciprocal of `func` applied along `axis`, with
        zero entries replaced by 1.0 to leave constant slices unchanged) is
        returned instead. The vector has one entry per column for ``axis=0``
        and one entry per row for ``axis=1``.
    """
    # Only the default np.std gets the ddof parameter injected; an explicit
    # ddof passed through **kwargs wins.
    if func is np.std and "ddof" not in kwargs:
        kwargs["ddof"] = ddof

    # pandas-stubs types apply()'s axis as a Literal, so a plain ``int`` axis does
    # not match any overload; the call is valid at runtime.
    vector = pd.DataFrame(X).apply(func, axis=axis, **kwargs).to_numpy()  # type: ignore[call-overload]  # pandas-stubs axis is Literal

    # Zero-variance (constant) slices are left as-is, mirroring MCUVScaler, so
    # that ``1.0 / vector`` does not introduce inf/NaN.
    vector = np.where(vector == 0, 1.0, vector)
    vector = 1.0 / vector

    # A per-row vector (axis=1) has one entry per row. NumPy/pandas would
    # broadcast a 1-D vector along the columns, which is wrong (and fails for
    # non-square input), so present it as a column for the multiplication.
    per_row = axis in (1, "columns") and vector.ndim == 1 and np.ndim(X) == 2
    factor = vector[:, np.newaxis] if per_row else vector

    scaled = np.multiply(X, factor)
    if extra_output:
        return scaled, vector
    return scaled