Source code for skfb.estimators._anomaly

"""Fallback classification via anomaly detection."""

__all__ = ("AnomalyFallbackClassifier",)

import numpy as np
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_is_fitted, NotFittedError

from ..core import array as ska
from ..utils._legacy import HasMethods, validate_params
from ..utils._validation import check_X_y_sample_weight
from .base import BaseFallbackClassifier, _estimator_has


[docs]class AnomalyFallbackClassifier(BaseFallbackClassifier):
    """A fallback classifier based on provided anomaly detector.

    Augments ``estimator`` behavior with a reject option based on outlier detection.
    If ``outlier_detector`` predicts -1 for a given input, then returns, masks, or
    ignores rejections depending on ``fallback_mode``. Otherwise, accepts the input
    and makes predictions.

    Parameters
    ----------
    estimator : object
        The base estimator making decisions w/o fallbacks.
    outlier_detector : object
        The outlier detector returning 1 for inliers and -1 for outliers.
    remove_outliers : bool, default=False
        Whether to remove outliers from training data before fitting.
    fallback_label : any, default=-1
        The label of a rejected example.
        Should be compatible w/ the class labels from training data.
    fallback_mode : {"return", "store", "ignore"}, default="store"
        While predicting w/ the ``predict`` method, whether to return:

        * (``"return"``) a numpy ndarray of both predictions and fallbacks;
        * (``"store"``)  an ``FBNDArray`` of predictions storing also fallback mask;
        * (``"ignore"``) a numpy ndarray of only estimator's predictions.

        Calling ``decision_function`` or ``predict_proba`` is equivalent to
        ``estimator``'s corresponding calls except that with ``"store"``, ``FBNDArray``
        is returned.

    Attributes
    ----------
    estimator_ : object
        Trained base estimator.
    outlier_detector_ : object
        Trained outlier detector.

    Examples
    --------
    >>> import numpy as np
    >>> from skfb.estimators import AnomalyFallbackClassifier
    >>> from sklearn.ensemble import IsolationForest
    >>> from sklearn.linear_model import LogisticRegression
    >>> estimator = LogisticRegression(random_state=0)
    >>> outlier_detector = IsolationForest(n_estimators=10, max_samples=1.0,
    ...                                    contamination=0.2, random_state=0)
    >>> rejector = AnomalyFallbackClassifier(estimator, outlier_detector)
    >>> X = np.array([
    ...     [0, 0], [10, 10], [1, 1], [9, 9], [1, 0], [9, 10], [0, 1], [10, 9],
    ...     [5.5, 5], [5., 5.5]
    ... ])
    >>> y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    >>> rejector.fit(X, y).predict(X).get_dense_fallback_mask()
    array([False, False, False, False, False, False, False, False,  True,
            True])
    >>> rejector.set_params(fallback_mode="return").predict(X)
    array([ 0,  1,  0,  1,  0,  1,  0,  1, -1, -1])
    >>> rejector.score(X, y)
    1.0
    """

    _parameter_constraints = {**BaseFallbackClassifier._parameter_constraints}
    _parameter_constraints.update(
        {
            "outlier_detector": [HasMethods(["fit_predict"])],
            "remove_outliers": ["boolean"],
        },
    )

    def __init__(
        self,
        estimator,
        outlier_detector,
        remove_outliers=False,
        fallback_label=-1,
        fallback_mode="store",
    ):
        super().__init__(
            estimator=estimator,
            fallback_label=fallback_label,
            fallback_mode=fallback_mode,
        )

        self.outlier_detector = outlier_detector
        self.remove_outliers = remove_outliers

        # NOTE: I believe we are violating a scikit-learn proposal by assigning an
        #       attribute right after the initialization instead of the fitting.
        #       But this seems to be the best way to prevent refitting.
        try:
            check_is_fitted(self.estimator, "classes_")
            check_is_fitted(self.outlier_detector)

            fallback_label_ = self.validate_fallback_label(
                self.fallback_label, self.estimator.classes_
            )
            fitted_params = {
                "estimator_": self.estimator,
                "outlier_detector_": self.outlier_detector,
                "classes_": self.estimator.classes_,
                "fallback_label_": fallback_label_,
            }
            self._set_fitted_attributes(fitted_params)
        except NotFittedError:
            pass

[docs]    def fit(self, X, y, **fit_params):
        """Trains base estimator and outlier detector then sets fit attributes.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values.

        Returns
        -------
        self : object
            Returns self.
        """
        sample_weight = fit_params.pop("sample_weight", None)
        X, y, sample_weight = check_X_y_sample_weight(
            X, y=y, sample_weight=sample_weight
        )

        if sample_weight is not None:
            self.outlier_detector.fit(X, y=y, sample_weight=sample_weight, **fit_params)
        else:
            self.outlier_detector.fit(X, y=y, **fit_params)

        self._set_fitted_attributes(
            {
                "outlier_detector_": self.outlier_detector,
                "is_fitted_": False,  # Not yet; after super().fit(X, y, **fp)
            },
        )

        if self.remove_outliers:
            acceptance_mask = self.outlier_detector_.predict(X) == 1
            X = X[acceptance_mask]
            y = y[acceptance_mask]
            if sample_weight is not None:
                sample_weight = sample_weight[acceptance_mask]

        if sample_weight is not None:
            return super().fit(X, y, sample_weight=sample_weight, **fit_params)
        else:
            return super().fit(X, y, **fit_params)

    def _predict(self, X):
        """Runs outlier detection and classification.

        Returns both fallbacks and classes if ``self.fallback_mode == 'return'``,
        or classes w/ fallback mask if ``self.fallback_mode == 'store'``.
        """
        X, _, _ = check_X_y_sample_weight(X)

        y_out = self.outlier_detector_.predict(X)
        fallback_mask = y_out == -1

        if self.fallback_mode == "return":
            y_comb = np.empty(len(X), dtype=self.classes_.dtype)
            acceptance_mask = ~fallback_mask
            y_comb[acceptance_mask] = self.estimator_.predict(X[acceptance_mask])
            y_comb[fallback_mask] = self.fallback_label_
            return y_comb
        else:
            y_pred = self.estimator_.predict(X)
            y_pred = ska.fbarray(y_pred, fallback_mask)
            return y_pred

    def _set_fallback_mask(self, y_prob, X):
        """Doesn't set fallback mask for ``predict_proba`` and ``decision_function``."""
        y_out = self.outlier_detector_.predict(X)
        y_prob.fallback_mask = y_out == -1

[docs]    @available_if(_estimator_has("decision_function"))
    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def decision_function(self, X):
        """Calls ``decision_function`` on the estimator and sets fallback mask.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the
            underlying estimator.

        Returns
        -------
        y_pred : FBNDArray of shape (n_samples,) or (n_samples, n_classes)
            Predicted class scores for `X` based on the estimator.
            If ``self.fallback_mode == "store"``, scores store fallback mask.
        """
        check_is_fitted(self, attributes="is_fitted_")
        y_prob = self.estimator_.decision_function(X)
        if self.fallback_mode == "store":
            y_prob = ska.fbarray(y_prob)
            self._set_fallback_mask(y_prob, X=X)
        return y_prob

[docs]    @available_if(_estimator_has("predict_proba"))
    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def predict_proba(self, X):
        """Calls ``predict_proba`` on the estimator.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the
            underlying estimator.

        Returns
        -------
        y_pred : FBNDArray of shape (n_samples,) or (n_samples, n_classes)
            Predicted class probabilities for `X` based on the estimator.
            The order of the classes corresponds to that in the fitted
            attribute :term:`classes_`.
            If ``self.fallback_mode == "store"``, probabilities store fallback mask.
        """
        check_is_fitted(self, attributes="is_fitted_")
        y_prob = self.estimator_.predict_proba(X)
        if self.fallback_mode == "store":
            y_prob = ska.fbarray(y_prob)
            self._set_fallback_mask(y_prob, X=X)
        return y_prob