Source code for process_improve.experiments.evaluate

# (c) Kevin Dunn, 2010-2026. MIT License.

"""Design evaluation: quality metrics for experimental designs.

Provides :func:`evaluate_design`, which computes properties and quality metrics
of an existing design matrix.  Supported metrics include efficiency values
(D/I/G), prediction variance, VIF, condition number, power analysis, alias
structure, confounding, resolution, defining relation, clear effects, minimum
aberration, and degrees of freedom.

Example
-------
>>> from process_improve.experiments import evaluate_design, generate_design, Factor
>>> factors = [Factor(name="A", low=0, high=10), Factor(name="B", low=0, high=10)]
>>> result = generate_design(factors, design_type="full_factorial", center_points=0)
>>> metrics = evaluate_design(result, model="interactions", metric=["d_efficiency", "vif"])
"""

from __future__ import annotations

import itertools
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import numpy as np
import pandas as pd
from patsy import dmatrix
from scipy import stats
from scipy.stats import qmc
from statsmodels.stats.outliers_influence import variance_inflation_factor

from process_improve.experiments.factor import DesignResult

# ---------------------------------------------------------------------------
# Internal context shared across metric computations
# ---------------------------------------------------------------------------


@dataclass
class _EvalContext:
    """Shared evaluation context, computed once and reused by all metrics."""

    X: np.ndarray
    column_names: list[str]
    factor_names: list[str]
    design_df: pd.DataFrame
    N: int
    p: int
    XtX: np.ndarray
    XtX_inv: np.ndarray | None  # None when X'X is singular
    is_singular: bool
    generators: list[str] | None
    defining_relation: list[str] | None
    resolution: int | None
    effect_size: float | None
    alpha: float
    sigma: float | None


# ---------------------------------------------------------------------------
# Model matrix construction
# ---------------------------------------------------------------------------

_ROMAN = {3: "III", 4: "IV", 5: "V", 6: "VI", 7: "VII", 8: "VIII"}


def _build_model_matrix(
    design_df: pd.DataFrame,
    model: str | None,
    factor_names: list[str],
) -> tuple[np.ndarray, list[str]]:
    """Build the expanded model matrix *X* using patsy.

    Parameters
    ----------
    design_df : DataFrame
        Design matrix with one column per factor (coded units).
    model : str or None
        ``"main_effects"``, ``"interactions"``, ``"quadratic"``, an explicit
        patsy formula, or *None* (defaults to ``"interactions"``).
    factor_names : list[str]
        Ordered factor names.

    Returns
    -------
    X : ndarray of shape (N, p)
        The model matrix including the intercept column.
    column_names : list[str]
        Human-readable names for each column of *X*.
    """
    if model is None:
        model = "interactions"

    # Map shorthand names to patsy right-hand-side formulas
    joined = " + ".join(factor_names)
    if model == "main_effects":
        rhs = joined
    elif model == "interactions":
        rhs = f"({joined}) ** 2"
    elif model == "quadratic":
        squared = " + ".join(f"I({f} ** 2)" for f in factor_names)
        rhs = f"({joined}) ** 2 + {squared}"
    elif "~" in model:
        # Explicit formula with response side - strip LHS
        rhs = model.split("~", 1)[1].strip()
    else:
        # Assume it is already a valid RHS formula
        rhs = model

    dm = dmatrix(rhs, design_df, return_type="dataframe")
    X = np.asarray(dm, dtype=float)
    column_names = list(dm.columns)
    return X, column_names


def _build_context(  # noqa: PLR0913
    design_df: pd.DataFrame,
    factor_names: list[str],
    model: str | None,
    generators: list[str] | None,
    defining_relation: list[str] | None,
    resolution: int | None,
    effect_size: float | None,
    alpha: float,
    sigma: float | None,
) -> _EvalContext:
    """Build the shared evaluation context."""
    X, column_names = _build_model_matrix(design_df, model, factor_names)
    N, p = X.shape
    XtX = X.T @ X

    rank = np.linalg.matrix_rank(XtX)
    is_singular = rank < p
    XtX_inv: np.ndarray | None = None
    if not is_singular:
        XtX_inv = np.linalg.inv(XtX)

    return _EvalContext(
        X=X,
        column_names=column_names,
        factor_names=factor_names,
        design_df=design_df,
        N=N,
        p=p,
        XtX=XtX,
        XtX_inv=XtX_inv,
        is_singular=is_singular,
        generators=generators,
        defining_relation=defining_relation,
        resolution=resolution,
        effect_size=effect_size,
        alpha=alpha,
        sigma=sigma,
    )


# ---------------------------------------------------------------------------
# Metric implementations
# ---------------------------------------------------------------------------


def _compute_d_efficiency(ctx: _EvalContext) -> dict[str, Any]:
    """D-efficiency: 100 * det(X'X)^(1/p) / N."""
    if ctx.is_singular:
        return {"d_efficiency": None, "note": "Design is rank-deficient for the specified model."}
    sign, logdet = np.linalg.slogdet(ctx.XtX)
    if sign <= 0:
        return {"d_efficiency": None, "note": "X'X has non-positive determinant."}
    d_eff = 100.0 * np.exp(logdet / ctx.p) / ctx.N
    return {"d_efficiency": float(d_eff)}


def _prediction_variance_at_points(X_points: np.ndarray, XtX_inv: np.ndarray) -> np.ndarray:
    """Compute d(x) = x' (X'X)^-1 x for each row of X_points."""
    # (X_points @ XtX_inv) element-wise * X_points, summed per row
    return np.sum((X_points @ XtX_inv) * X_points, axis=1)


def _generate_evaluation_grid(factor_names: list[str], n_points: int = 5000) -> np.ndarray:
    """Generate points in [-1, 1]^k for evaluating prediction variance.

    Uses Sobol sequences for efficient space-filling coverage.
    """
    k = len(factor_names)
    sampler = qmc.Sobol(d=k, scramble=True, seed=42)
    # Power-of-2 samples for Sobol balance
    m = max(10, int(np.ceil(np.log2(n_points))))
    points_01 = sampler.random_base2(m)  # in [0, 1]^k
    return points_01 * 2.0 - 1.0  # scale to [-1, 1]^k


def _expand_grid_points(grid_raw: np.ndarray, factor_names: list[str], model: str | None) -> np.ndarray:
    """Expand raw grid points through the model formula to get X_grid."""
    df_grid = pd.DataFrame(grid_raw, columns=factor_names)
    X_grid, _ = _build_model_matrix(df_grid, model, factor_names)
    return X_grid


def _compute_g_efficiency(ctx: _EvalContext) -> dict[str, Any]:
    """G-efficiency: 100 * p / (N * max prediction variance over design region)."""
    if ctx.is_singular:
        return {"g_efficiency": None, "note": "Design is rank-deficient for the specified model."}

    grid_raw = _generate_evaluation_grid(ctx.factor_names)
    # Determine the model string to rebuild for grid points
    model_str = _infer_model_string(ctx)
    X_grid = _expand_grid_points(grid_raw, ctx.factor_names, model_str)
    pv = _prediction_variance_at_points(X_grid, ctx.XtX_inv)
    max_pv = float(np.max(pv))

    g_eff = 100.0 * ctx.p / (ctx.N * max_pv) if max_pv > 0 else None
    return {
        "g_efficiency": float(g_eff) if g_eff is not None else None,
        "max_prediction_variance": max_pv,
    }


def _compute_i_efficiency(ctx: _EvalContext) -> dict[str, Any]:
    """I-efficiency: 100 * p / (N * average prediction variance over design region)."""
    if ctx.is_singular:
        return {"i_efficiency": None, "note": "Design is rank-deficient for the specified model."}

    grid_raw = _generate_evaluation_grid(ctx.factor_names)
    model_str = _infer_model_string(ctx)
    X_grid = _expand_grid_points(grid_raw, ctx.factor_names, model_str)
    pv = _prediction_variance_at_points(X_grid, ctx.XtX_inv)
    avg_pv = float(np.mean(pv))

    i_eff = 100.0 * ctx.p / (ctx.N * avg_pv) if avg_pv > 0 else None
    return {
        "i_efficiency": float(i_eff) if i_eff is not None else None,
        "average_prediction_variance": avg_pv,
    }


def _infer_model_string(ctx: _EvalContext) -> str | None:
    """Reconstruct the model string from the context for grid expansion.

    We need to re-apply the same model transformation to grid points.  Since
    the context stores only the expanded column names, we reconstruct the
    model shorthand by checking the column structure.
    """
    cols = set(ctx.column_names)
    has_interactions = any(":" in c for c in cols)
    has_squared = any("**" in c or c.startswith("I(") for c in cols)

    if has_squared:
        return "quadratic"
    if has_interactions:
        return "interactions"
    return "main_effects"


def _compute_prediction_variance(ctx: _EvalContext) -> dict[str, Any]:
    """Prediction variance d(x_i) = x_i' (X'X)^-1 x_i at each design point."""
    if ctx.is_singular:
        return {"prediction_variance": None, "note": "Design is rank-deficient for the specified model."}

    pv = _prediction_variance_at_points(ctx.X, ctx.XtX_inv)
    pv_list = [float(v) for v in pv]
    return {
        "prediction_variance": pv_list,
        "mean": float(np.mean(pv)),
        "max": float(np.max(pv)),
        "min": float(np.min(pv)),
    }


def _compute_vif(ctx: _EvalContext) -> dict[str, Any]:
    """Variance Inflation Factor for each model term (excluding intercept)."""
    if ctx.is_singular:
        return {"vif": None, "note": "Design is rank-deficient for the specified model."}

    vif_dict: dict[str, float] = {}
    for i, name in enumerate(ctx.column_names):
        if name.lower() == "intercept" or name == "1":
            continue
        vif_val = variance_inflation_factor(ctx.X, i)
        vif_dict[name] = float(vif_val)
    return {"vif": vif_dict}


def _compute_condition_number(ctx: _EvalContext) -> dict[str, float]:
    """Condition number of the model matrix X."""
    cn = float(np.linalg.cond(ctx.X))
    return {"condition_number": cn}


def _compute_power(ctx: _EvalContext) -> dict[str, Any]:
    """Statistical power for detecting each model term."""
    if ctx.is_singular:
        return {"power": None, "note": "Design is rank-deficient for the specified model."}

    sigma = ctx.sigma if ctx.sigma is not None else 1.0
    df_resid = ctx.N - ctx.p

    if df_resid <= 0:
        return {"power": None, "note": "No residual degrees of freedom (saturated model)."}

    assert ctx.XtX_inv is not None  # guaranteed by not is_singular
    diag_inv = np.diag(ctx.XtX_inv)

    if ctx.effect_size is not None:
        # Single power value per term
        power_dict: dict[str, float] = {}
        for i, name in enumerate(ctx.column_names):
            if name.lower() == "intercept" or name == "1":
                continue
            ncp = (ctx.effect_size**2) / (sigma**2 * diag_inv[i])
            f_crit = stats.f.ppf(1.0 - ctx.alpha, dfn=1, dfd=df_resid)
            pwr = 1.0 - stats.ncf.cdf(f_crit, dfn=1, dfd=df_resid, nc=ncp)
            power_dict[name] = float(pwr)
        return {"power": power_dict}

    # No effect_size: generate power curves over a range of effect sizes
    effect_sizes = np.linspace(0.5 * sigma, 3.0 * sigma, 20)
    power_curves: dict[str, list[dict[str, float]]] = {}
    for i, name in enumerate(ctx.column_names):
        if name.lower() == "intercept" or name == "1":
            continue
        curve = []
        for es in effect_sizes:
            ncp = (es**2) / (sigma**2 * diag_inv[i])
            f_crit = stats.f.ppf(1.0 - ctx.alpha, dfn=1, dfd=df_resid)
            pwr = 1.0 - stats.ncf.cdf(f_crit, dfn=1, dfd=df_resid, nc=ncp)
            curve.append({"effect_size": float(es), "power": float(pwr)})
        power_curves[name] = curve
    return {"power_curves": power_curves, "sigma": float(sigma)}


def _compute_degrees_of_freedom(ctx: _EvalContext) -> dict[str, Any]:
    """Degrees of freedom breakdown."""
    df_model = ctx.p - 1  # excluding intercept
    df_residual = ctx.N - ctx.p
    df_total = ctx.N - 1

    # Detect replicates by counting distinct rows
    design_rounded = np.round(ctx.design_df[ctx.factor_names].values, decimals=10)
    n_distinct = len(set(map(tuple, design_rounded)))
    has_replicates = n_distinct < ctx.N

    result: dict[str, Any] = {
        "degrees_of_freedom": {
            "model": df_model,
            "residual": df_residual,
            "total": df_total,
        }
    }
    if has_replicates:
        df_pure_error = ctx.N - n_distinct
        df_lack_of_fit = n_distinct - ctx.p if n_distinct > ctx.p else 0
        result["degrees_of_freedom"]["pure_error"] = df_pure_error
        result["degrees_of_freedom"]["lack_of_fit"] = df_lack_of_fit

    return result


# ---------------------------------------------------------------------------
# Alias / confounding metrics (GF(2) arithmetic on generator words)
# ---------------------------------------------------------------------------


def _parse_word(word: str, factor_names: list[str]) -> frozenset[int]:
    """Parse a word like ``"ABCE"`` into a frozenset of factor indices.

    Also handles ``"I=ABCE"`` format (strips the ``I=`` prefix).
    """
    word = word.strip().removeprefix("I=")
    if word == "I":
        return frozenset()

    # Try multi-char factor names first (when names are longer than 1 char)
    name_to_idx = {name: i for i, name in enumerate(factor_names)}

    # If all factor names are single chars, parse character-by-character
    if all(len(n) == 1 for n in factor_names):
        indices = set()
        for ch in word:
            if ch in name_to_idx:
                indices.add(name_to_idx[ch])
        return frozenset(indices)

    # Multi-char names: try to match greedily (longest first)
    remaining = word
    indices = set()
    sorted_names = sorted(name_to_idx.keys(), key=len, reverse=True)
    while remaining:
        matched = False
        for name in sorted_names:
            if remaining.startswith(name):
                indices.add(name_to_idx[name])
                remaining = remaining[len(name) :]
                matched = True
                break
        if not matched:
            remaining = remaining[1:]  # skip unrecognized character
    return frozenset(indices)


def _word_to_str(indices: frozenset[int], factor_names: list[str]) -> str:
    """Convert factor indices back to a word string."""
    if not indices:
        return "I"
    sorted_indices = sorted(indices)
    return "".join(factor_names[i] for i in sorted_indices)


def _multiply_words(w1: frozenset[int], w2: frozenset[int]) -> frozenset[int]:
    """Multiply two words in GF(2) - symmetric difference of factor sets."""
    return w1.symmetric_difference(w2)


def _defining_relation_from_generators(
    generators: list[str], factor_names: list[str]
) -> list[frozenset[int]]:
    """Compute the full defining relation from generator strings.

    Each generator like ``"D=ABC"`` produces the word ``ABCD``.  The full
    defining relation is the closure under GF(2) multiplication of all
    generator words and their products (all non-empty subsets).
    """
    # Parse each generator into a defining word
    base_words: list[frozenset[int]] = []
    for gen in generators:
        parts = gen.split("=")
        lhs = parts[0].strip()
        rhs = parts[1].strip() if len(parts) > 1 else ""
        lhs_idx = _parse_word(lhs, factor_names)
        rhs_idx = _parse_word(rhs, factor_names)
        word = _multiply_words(lhs_idx, rhs_idx)
        base_words.append(word)

    # Generate all non-empty subsets and their products
    all_words: set[frozenset[int]] = set()
    for r in range(1, len(base_words) + 1):
        for subset in itertools.combinations(base_words, r):
            product = frozenset()
            for w in subset:
                product = _multiply_words(product, w)
            if product:  # exclude identity
                all_words.add(product)

    return sorted(all_words, key=lambda w: (len(w), sorted(w)))


def _compute_defining_relation(ctx: _EvalContext) -> dict[str, Any]:
    """Compute or return the defining relation."""
    if ctx.defining_relation:
        return {"defining_relation": ctx.defining_relation}

    if not ctx.generators:
        return {"defining_relation": None, "note": "No generators available. Not a fractional factorial design."}

    words = _defining_relation_from_generators(ctx.generators, ctx.factor_names)
    relation = [f"I={_word_to_str(w, ctx.factor_names)}" for w in words]
    return {"defining_relation": relation}


def _compute_resolution(ctx: _EvalContext) -> dict[str, Any]:
    """Design resolution = minimum word length in the defining relation."""
    if ctx.resolution is not None:
        roman = _ROMAN.get(ctx.resolution, str(ctx.resolution))
        return {"resolution": ctx.resolution, "roman": roman}

    if not ctx.generators:
        return {"resolution": None, "roman": None, "note": "Not a fractional factorial design."}

    words = _defining_relation_from_generators(ctx.generators, ctx.factor_names)
    if not words:
        return {"resolution": None, "roman": None, "note": "No defining relation words found."}

    res = min(len(w) for w in words)
    roman = _ROMAN.get(res, str(res))
    return {"resolution": res, "roman": roman}


def _compute_alias_structure(ctx: _EvalContext) -> dict[str, Any]:
    """Alias structure: which effects are aliased with which others.

    Uses GF(2) arithmetic when generators are available; falls back to
    correlation-based detection otherwise.
    """
    if ctx.generators:
        return _alias_structure_from_generators(ctx)
    return _alias_structure_from_correlation(ctx)


def _alias_structure_from_generators(ctx: _EvalContext) -> dict[str, Any]:
    """Compute alias chains using GF(2) multiplication against the defining relation."""
    words = _defining_relation_from_generators(ctx.generators, ctx.factor_names)
    if not words:
        return {"alias_structure": []}

    # Build alias chains for main effects and 2-factor interactions
    alias_chains: list[str] = []
    k = len(ctx.factor_names)

    # Main effects
    for i in range(k):
        effect = frozenset([i])
        effect_name = ctx.factor_names[i]
        aliases = []
        for w in words:
            alias = _multiply_words(effect, w)
            alias_name = _word_to_str(alias, ctx.factor_names)
            aliases.append(alias_name)
        # Sort by word length
        aliases.sort(key=lambda s: (len(s), s))
        chain = f"{effect_name} = " + " + ".join(aliases)
        alias_chains.append(chain)

    # 2-factor interactions
    for i, j in itertools.combinations(range(k), 2):
        effect = frozenset([i, j])
        effect_name = _word_to_str(effect, ctx.factor_names)
        aliases = []
        for w in words:
            alias = _multiply_words(effect, w)
            alias_name = _word_to_str(alias, ctx.factor_names)
            aliases.append(alias_name)
        aliases.sort(key=lambda s: (len(s), s))
        chain = f"{effect_name} = " + " + ".join(aliases)
        alias_chains.append(chain)

    return {"alias_structure": alias_chains}


def _is_intercept_col(name: str) -> bool:
    """Check if a column name represents the intercept."""
    return name.lower() == "intercept" or name == "1"


def _find_correlated_aliases(
    X_full: np.ndarray, col_names: list[str], col_idx: int, nonzero: np.ndarray, threshold: float
) -> list[tuple[str, str]]:
    """Find columns highly correlated with column *col_idx*."""
    aliases: list[tuple[str, str]] = []
    for j in range(X_full.shape[1]):
        if j == col_idx or not nonzero[j] or _is_intercept_col(col_names[j]):
            continue
        corr = np.corrcoef(X_full[:, col_idx], X_full[:, j])[0, 1]
        if abs(corr) > threshold:
            sign = "+" if corr > 0 else "-"
            aliases.append((sign, col_names[j]))
    return aliases


def _alias_structure_from_correlation(ctx: _EvalContext) -> dict[str, Any]:
    """Detect aliasing via correlation of model matrix columns."""
    if ctx.p <= 1:
        return {"alias_structure": []}

    # Build an expanded model matrix with higher-order terms for alias detection
    factor_str = " + ".join(ctx.factor_names)
    k = len(ctx.factor_names)
    max_order = min(k, 3)
    rhs = f"({factor_str}) ** {max_order}" if max_order >= 3 else f"({factor_str}) ** 2"

    dm = dmatrix(rhs, ctx.design_df, return_type="dataframe")
    X_full = np.asarray(dm, dtype=float)
    col_names = list(dm.columns)

    stddevs = X_full.std(axis=0)
    nonzero = stddevs > np.sqrt(np.finfo(float).eps)

    alias_chains: list[str] = []
    for i in range(X_full.shape[1]):
        if _is_intercept_col(col_names[i]) or not nonzero[i]:
            continue
        aliases = _find_correlated_aliases(X_full, col_names, i, nonzero, threshold=0.995)
        if aliases:
            alias_parts = [f"{sign}{name}" for sign, name in aliases]
            alias_chains.append(f"{col_names[i]} = " + " + ".join(alias_parts))

    return {"alias_structure": alias_chains}


def _compute_confounding(ctx: _EvalContext) -> dict[str, Any]:
    """Confounding structure: pairs of effects that cannot be distinguished."""
    alias_result = _compute_alias_structure(ctx)
    alias_chains = alias_result.get("alias_structure", [])
    if not alias_chains:
        return {"confounding": [], "note": "No confounding detected."}

    confounding_list: list[dict[str, Any]] = []
    for chain in alias_chains:
        if " = " not in chain:
            continue
        parts = chain.split(" = ", 1)
        effect = parts[0].strip()
        aliases_str = parts[1].strip()
        confounded = [a.strip().lstrip("+-") for a in aliases_str.split(" + ")]
        confounding_list.append({
            "effect": effect,
            "confounded_with": confounded,
        })
    return {"confounding": confounding_list}


def _effect_order(term: str, factor_names: list[str]) -> int:
    """Determine the order of an effect term (1=main, 2=2FI, etc.)."""
    if ":" in term:
        return term.count(":") + 1
    if term in factor_names:
        return 1
    return len(term)  # approximate for single-char factor names


def _compute_clear_effects(ctx: _EvalContext) -> dict[str, Any]:
    """Identify effects whose aliases are all of higher order."""
    alias_result = _compute_alias_structure(ctx)
    alias_chains = alias_result.get("alias_structure", [])

    clear_main: list[str] = []
    clear_2fi: list[str] = []

    for chain in alias_chains:
        if " = " not in chain:
            continue
        effect, aliases_str = chain.split(" = ", 1)
        effect = effect.strip()
        order = _effect_order(effect, ctx.factor_names)

        alias_terms = [a.strip().lstrip("+-") for a in aliases_str.strip().split(" + ")]
        all_higher = all(_effect_order(a, ctx.factor_names) > order for a in alias_terms)

        if all_higher and order == 1:
            clear_main.append(effect)
        elif all_higher and order == 2:
            clear_2fi.append(effect)

    return {
        "clear_effects": {
            "main_effects": clear_main,
            "two_factor_interactions": clear_2fi,
        }
    }


def _compute_minimum_aberration(ctx: _EvalContext) -> dict[str, Any]:
    """Wordlength pattern (A_3, A_4, ...) from the defining relation."""
    if not ctx.generators:
        return {
            "minimum_aberration": {
                "wordlength_pattern": [],
                "note": "Not a fractional factorial design.",
            }
        }

    words = _defining_relation_from_generators(ctx.generators, ctx.factor_names)
    if not words:
        return {
            "minimum_aberration": {
                "wordlength_pattern": [],
                "note": "No defining relation words found.",
            }
        }

    lengths = [len(w) for w in words]
    max_len = max(lengths) if lengths else 0
    # Wordlength pattern: A_i = number of words of length i, starting at i=3
    pattern = [lengths.count(i) for i in range(3, max_len + 1)]

    return {
        "minimum_aberration": {
            "wordlength_pattern": pattern,
            "wordlength_pattern_labels": [f"A_{i}" for i in range(3, max_len + 1)],
        }
    }


# ---------------------------------------------------------------------------
# Metric dispatch registry
# ---------------------------------------------------------------------------

_METRIC_REGISTRY: dict[str, Callable[[_EvalContext], Any]] = {
    "d_efficiency": _compute_d_efficiency,
    "i_efficiency": _compute_i_efficiency,
    "g_efficiency": _compute_g_efficiency,
    "prediction_variance": _compute_prediction_variance,
    "vif": _compute_vif,
    "condition_number": _compute_condition_number,
    "power": _compute_power,
    "degrees_of_freedom": _compute_degrees_of_freedom,
    "alias_structure": _compute_alias_structure,
    "confounding": _compute_confounding,
    "resolution": _compute_resolution,
    "defining_relation": _compute_defining_relation,
    "clear_effects": _compute_clear_effects,
    "minimum_aberration": _compute_minimum_aberration,
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



[docs]
def evaluate_design(  # noqa: PLR0913
    design_matrix: pd.DataFrame | DesignResult,
    model: str | None = None,
    metric: str | list[str] = "d_efficiency",
    effect_size: float | None = None,
    alpha: float = 0.05,
    sigma: float | None = None,
) -> dict[str, Any]:
    """Compute quality metrics for an experimental design.

    Parameters
    ----------
    design_matrix : DataFrame or DesignResult
        The design to evaluate.  If a :class:`DesignResult` is passed, the
        coded design matrix and any generator / defining-relation metadata
        are extracted automatically.
    model : str or None
        Model type: ``"main_effects"``, ``"interactions"``, ``"quadratic"``,
        or an explicit patsy formula.  ``None`` defaults to ``"interactions"``.
    metric : str or list[str]
        One or more metric names to compute.  Valid names:
        ``"alias_structure"``, ``"confounding"``, ``"resolution"``,
        ``"defining_relation"``, ``"power"``, ``"d_efficiency"``,
        ``"i_efficiency"``, ``"g_efficiency"``, ``"prediction_variance"``,
        ``"degrees_of_freedom"``, ``"vif"``, ``"condition_number"``,
        ``"clear_effects"``, ``"minimum_aberration"``.
    effect_size : float or None
        Expected effect size for power calculation.  When *None*, a power
        curve over a range of effect sizes is returned instead.
    alpha : float
        Significance level for power calculation (default 0.05).
    sigma : float or None
        Estimated noise standard deviation.  Defaults to 1.0 when needed
        but not provided.

    Returns
    -------
    dict[str, Any]
        Results keyed by metric name.  The structure of each value depends
        on the metric - see individual metric documentation.

    Examples
    --------
    >>> from process_improve.experiments import evaluate_design, generate_design, Factor
    >>> factors = [Factor(name="A", low=0, high=10), Factor(name="B", low=0, high=10)]
    >>> result = generate_design(factors, design_type="full_factorial", center_points=0)
    >>> metrics = evaluate_design(result, model="main_effects", metric="d_efficiency")
    >>> metrics["d_efficiency"]  # doctest: +SKIP
    100.0
    """
    # --- Unpack input ---
    generators: list[str] | None = None
    defining_relation: list[str] | None = None
    resolution: int | None = None

    if isinstance(design_matrix, DesignResult):
        generators = design_matrix.generators
        defining_relation = design_matrix.defining_relation
        resolution = design_matrix.resolution
        factor_names = list(design_matrix.factor_names)
        design_df = pd.DataFrame(design_matrix.design)
    else:
        design_df = pd.DataFrame(design_matrix)
        factor_names = list(design_df.columns)

    # Drop non-factor columns
    for col in ["RunOrder", "Block"]:
        if col in design_df.columns and col not in factor_names:
            design_df = design_df.drop(columns=[col])
        elif col in factor_names:
            factor_names.remove(col)
            design_df = design_df.drop(columns=[col])

    # --- Normalize metric to list ---
    metrics = [metric] if isinstance(metric, str) else list(metric)

    # Validate metric names
    unknown = [m for m in metrics if m not in _METRIC_REGISTRY]
    if unknown:
        available = sorted(_METRIC_REGISTRY.keys())
        raise ValueError(f"Unknown metric(s): {unknown}. Available metrics: {available}")

    # --- Build context ---
    ctx = _build_context(
        design_df=design_df,
        factor_names=factor_names,
        model=model,
        generators=generators,
        defining_relation=defining_relation,
        resolution=resolution,
        effect_size=effect_size,
        alpha=alpha,
        sigma=sigma,
    )

    # --- Compute requested metrics ---
    results: dict[str, Any] = {}
    for m in metrics:
        result = _METRIC_REGISTRY[m](ctx)
        results.update(result)

    return results