"""Threshold-based cascade ensembles."""
from typing import Sequence
import warnings
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, check_is_fitted, clone
from sklearn.metrics import accuracy_score, get_scorer, get_scorer_names
from sklearn.model_selection import check_cv, ParameterGrid
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import NotFittedError
try:
from sklearn.utils.parallel import delayed, Parallel
except ModuleNotFoundError:
from joblib import Parallel, delayed
from ..utils._legacy import (
_fit_context,
Integral,
Interval,
Real,
StrOptions,
validate_params,
)
from ._common import fit_one
from ..core.array import earray
from ..core.exceptions import SKFBException, SKFBWarning
class CascadeNotFittedWarning(SKFBWarning):
    """Warns that cascade's base estimators are unfitted or fitted inconsistently."""
class CascadeParetoConfigWarning(SKFBWarning):
    """Warns that no threshold configuration satisfies cost-performance constraints."""
class CascadeParetoConfigException(SKFBException):
    """Raised if no threshold configuration satisfies cost-performance constraints."""
class ThresholdCascadeClassifier(BaseEstimator, ClassifierMixin):
    """Cascade of classifiers w/ deferrals based on predefined thresholds.

    During inference, runs the first estimator and if a predicted score is lower than
    ``thresholds[0]``, tries the second, and so on. The last estimator always makes
    predictions on the samples deferred by the previous estimators.

    If ``prefit=True`` and every estimator is fitted on the same classes, it is not
    necessary to run ``fit`` to make predictions.

    Parameters
    ----------
    estimators : array-like of object, length n_estimators
        Base estimators. Preferably, from weakest (e.g., rule-based or linear) to
        strongest (e.g., gradient boosting).
    thresholds : float or array-like of float, length n_estimators - 1
        Deferral thresholds for each base estimator except the last.
        If only one number is specified, every estimator (except the last) will have
        the same threshold (i.e., the threshold will be *global*).
        Longer sequences are accepted as well; the last estimator never defers, so
        its threshold is not used for deferral.
    response_method : {"predict_proba", "decision_function"}, default="predict_proba"
        Method of ``estimators`` whose output is compared with the deferral
        thresholds. For ``"decision_function"``, ``thresholds`` can be negative.
    return_earray : bool, default=True
        Whether to return :class:`~skfb.core.ENDArray` of predicted classes / scores
        or plain numpy ndarray.
    prefit : bool, default=False
        Whether estimators are fitted. If True, checks their ``classes_`` attributes
        for intercompatibility.
    n_jobs : int, default=None
        Number of parallel jobs used during training.
    verbose : int, default=False
        Verbosity level.

    Attributes
    ----------
    estimators_ : sequence of object
        Fitted base estimators (``estimators`` themselves if they were accepted
        as prefit).
    thresholds_ : list of float
        Deferral thresholds followed by a placeholder for the last estimator
        (``0.0`` for ``"predict_proba"``, ``-np.inf`` for ``"decision_function"``).
    classes_ : ndarray
        Class labels.

    Examples
    --------
    >>> import numpy as np
    >>> from skfb.ensemble import ThresholdCascadeClassifier
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.array([
    ...     [0, 0], [4, 4], [1, 1], [3, 3], [2.5, 2], [2., 2.5], [2., 2.], [2.5, 2.5]
    ... ])
    >>> y = np.array([0, 1, 0, 1, 0, 1, 1, 0])
    >>> maxent = LogisticRegression(random_state=0)
    >>> rf = RandomForestClassifier(random_state=0)
    >>> cascade = ThresholdCascadeClassifier([maxent, rf], [0.8]).fit(X, y)
    >>> cascade.score(X, y)
    1.0
    >>> cascade.set_estimators(0).score(X, y)  # Use only LogisticRegression
    0.75

    Notes
    -----
    If you want to have a fallback option (for the last estimator), consider rejectors
    from :mod:`skfb.estimators`.
    """

    _parameter_constraints = {
        "estimators": ["array-like"],
        "thresholds": ["array-like", Interval(Real, None, None, closed="neither")],
        "response_method": [StrOptions({"decision_function", "predict_proba"})],
        "return_earray": ["boolean"],
        "prefit": ["boolean"],
        "n_jobs": [Interval(Integral, -1, None, closed="left"), None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        estimators,
        thresholds,
        response_method="predict_proba",
        return_earray=True,
        prefit=False,
        n_jobs=None,
        verbose=False,
    ):
        self.estimators = estimators
        self.response_method = response_method
        self.thresholds = thresholds
        # Relies on ``estimators`` and ``response_method`` being set above.
        self._set_thresholds(thresholds)
        self.return_earray = return_earray
        self.prefit = prefit
        self.n_jobs = n_jobs
        self.verbose = verbose

        # Set to True only when the prefit check below accepts the estimators;
        # ``fit`` uses it to decide whether training can be skipped.
        self._prefit_accepted = False
        if self.prefit:
            self._adopt_prefit_estimators()

    def _adopt_prefit_estimators(self):
        """Marks the cascade as fitted if every base estimator is fitted compatibly.

        Warns with ``CascadeNotFittedWarning`` and leaves the cascade unfitted if
        an estimator is not fitted or consecutive estimators have different
        ``classes_``.
        """
        classes = None
        for i, estimator in enumerate(self.estimators):
            try:
                check_is_fitted(estimator, "classes_")
            except NotFittedError:
                warnings.warn(
                    f"Estimator {i} is not fitted; "
                    f"please, run cascade's `fit` method to train all estimators.",
                    category=CascadeNotFittedWarning,
                    stacklevel=3,
                )
                return

            if classes is not None and not np.array_equal(
                classes, estimator.classes_
            ):
                warnings.warn(
                    f"Estimators {i} and {i - 1} predict different classes; "
                    f"please, run cascade's `fit` method to train all "
                    f"estimators.",
                    category=CascadeNotFittedWarning,
                    stacklevel=3,
                )
                return

            classes = estimator.classes_

        self.estimators_ = self.estimators
        self.classes_ = classes
        self._current_estimators = self.estimators_[:]
        self._current_thresholds = self.thresholds_[:]
        self._prefit_accepted = True
        self.is_fitted_ = True

    @_fit_context(prefer_skip_nested_validation=False)
    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
            "y": ["array-like"],
            "sample_weight": ["array-like", None],
        },
        prefer_skip_nested_validation=True,
    )
    def fit(self, X, y, sample_weight=None):
        """Fits base estimators and sets meta-estimator attributes.

        Training is skipped only if ``prefit=True`` and the estimators passed the
        prefit check at construction; otherwise every call retrains all base
        estimators on the given data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns self.
        """
        self.classes_ = unique_labels(y)

        reuse_prefit = (
            self.prefit
            and getattr(self, "_prefit_accepted", False)
            and hasattr(self, "estimators_")
        )
        if not reuse_prefit:
            self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(fit_one)(estimator, X, y, sample_weight)
                for estimator in self.estimators
            )

        self._current_estimators = self.estimators_[:]
        self._current_thresholds = self.thresholds_[:]
        self.is_fitted_ = True
        return self

    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def predict(self, X):
        """Predicts classes using one or more base estimators.

        Tries estimators in the order specified during initialization. If the first
        estimator doesn't have a score higher or equal than the first threshold,
        switches to the second estimator, and so on. The last estimator always makes
        predictions if all the previous estimators deferred.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the underlying estimators.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Classes predicted by the base estimators. Wrapped with the ensemble
            mask into an ``earray`` if ``return_earray=True``.
        """
        check_is_fitted(self, attributes="is_fitted_")
        y_score = self._predict_scores(X)

        if y_score.ndim == 2:
            y_pred = np.take(self.classes_, y_score.argmax(axis=1))
        else:
            # 1-D decision scores: non-negative score selects ``classes_[1]``.
            y_pred = np.take(self.classes_, y_score >= 0)

        return earray(y_pred, y_score.ensemble_mask) if self.return_earray else y_pred

    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def predict_proba(self, X):
        """Predicts probabilities using one or more base estimators.

        Tries estimators in the order specified during initialization. If the first
        estimator doesn't have a score higher or equal than the first threshold,
        switches to the second estimator, and so on. The last estimator always makes
        predictions if all the previous estimators deferred.

        The scores are produced by calling ``response_method`` of the base
        estimators.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the underlying estimators.

        Returns
        -------
        y_prob : ndarray of shape (n_samples, n_classes)
            Probabilities predicted by the base estimators.
        """
        check_is_fitted(self, attributes="is_fitted_")
        return self._predict_scores(X)

    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def predict_log_proba(self, X):
        """Predicts log-probabilities using one or more base estimators.

        Computed as the natural logarithm of ``predict_proba(X)``.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the underlying estimators.

        Returns
        -------
        y_score : ndarray of shape (n_samples, n_classes)
            Log-probabilities predicted by the base estimators.
        """
        return np.log(self.predict_proba(X))

    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
        },
        prefer_skip_nested_validation=True,
    )
    def decision_function(self, X):
        """Predicts decision scores using one or more base estimators.

        Tries estimators in the order specified during initialization. If the first
        estimator doesn't have a score higher or equal than the first threshold,
        switches to the second estimator, and so on. The last estimator always makes
        predictions if all the previous estimators deferred.

        The scores are produced by calling ``response_method`` of the base
        estimators.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to classify.
            Must fulfill the input assumptions of the underlying estimators.

        Returns
        -------
        y_score : ndarray of shape n_samples
            Decision scores predicted by the base estimators.
        """
        check_is_fitted(self, attributes="is_fitted_")
        return self._predict_scores(X)

    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
            "y": ["array-like"],
            "sample_weight": ["array-like", None],
        },
        prefer_skip_nested_validation=True,
    )
    def score(self, X, y, sample_weight=None):
        """Computes accuracy score on true labels and cascade predictions.

        Parameters
        ----------
        X : indexable, length n_samples
            Input samples to evaluate.
            Must fulfill the input assumptions of the underlying estimators.
        y : array-like of shape (n_samples,)
            True labels for `X`.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        score : float
            Accuracy score.
        """
        check_is_fitted(self, attributes="is_fitted_")
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def set_params(self, **params):
        """Sets the parameters of the cascade.

        If thresholds are provided, the transformations are done accordingly, so there
        is no need to refit the cascade.

        Parameters
        ----------
        **params : dict
            Cascade parameters.

        Returns
        -------
        self : object
            Returns self.
        """
        super().set_params(**params)
        if "thresholds" in params:
            # Recompute after all parameters are applied so that thresholds are
            # validated against the new ``estimators`` / ``response_method``.
            self._set_thresholds(self.thresholds)
        return self

    def _set_thresholds(self, thresholds):
        """Transforms ``thresholds`` into correct sequence of thresholds.

        Sets ``thresholds_`` (one value per estimator, the last one being a
        placeholder) and resets the currently active thresholds.

        Raises
        ------
        ValueError
            If fewer than ``n_estimators - 1`` thresholds are given.
        """
        n_deferring = len(self.estimators) - 1

        if isinstance(thresholds, (int, float, np.integer, np.floating)):
            # One global threshold shared by all deferring estimators.
            self.thresholds_ = [thresholds] * n_deferring
        else:
            if len(thresholds) < n_deferring:
                raise ValueError(
                    "thresholds must be provided for at least all but the last "
                    "estimator"
                )
            self.thresholds_ = list(thresholds)

        # The last estimator never defers; pad with a placeholder when the
        # caller supplied thresholds only for the deferring estimators.
        if len(self.thresholds_) == n_deferring:
            if self.response_method == "decision_function":
                self.thresholds_.append(-np.inf)
            else:
                self.thresholds_.append(0.0)

        self._current_thresholds = self.thresholds_[:]
        return self

    def set_estimators(self, index):
        """Sets the estimators and thresholds to use for prediction and scoring.

        The threshold of the last selected estimator is set to 0.0 or -np.inf
        depending on the ``response_method`` attribute.

        By default, the cascade uses all trained estimators and thresholds
        (available by the ``estimators_`` and ``thresholds_`` attributes).

        Parameters
        ----------
        index : int, or slice, or "all", or array-like of int
            Positions of the fitted estimators to activate, in cascade order.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        TypeError
            If ``index`` is of unsupported type or value.
        ValueError
            If ``index`` selects no estimators.
        """
        check_is_fitted(self, attributes="is_fitted_")

        if isinstance(index, str):
            if index != "all":
                raise TypeError(f'the only supported string index is "all", not {index!r}')
            self._current_estimators = self.estimators_[:]
            self._current_thresholds = self.thresholds_[:]
            return self

        if isinstance(index, (int, np.integer)):
            positions = [index]
        elif isinstance(index, slice):
            positions = list(range(len(self.estimators_))[index])
        elif isinstance(index, (Sequence, np.ndarray)):
            positions = list(index)
        else:
            raise TypeError(
                f"index must be int or slice or sequence of int, not {type(index)}"
            )

        if not positions:
            raise ValueError("index must select at least one estimator")

        # Resolve everything before assigning so a bad index leaves the
        # currently active estimators untouched.
        estimators = [self.estimators_[i] for i in positions]
        thresholds = [self.thresholds_[i] for i in positions[:-1]]
        if self.response_method == "predict_proba":
            thresholds.append(0.0)
        else:
            thresholds.append(-np.inf)

        self._current_estimators = estimators
        self._current_thresholds = thresholds
        return self

    def reset_estimators(self):
        """Reactivates all the base estimators.

        Same as ``set_estimators("all")``. Use if you previously set to skip some
        estimators and thresholds, and want to activate all estimators again.

        Returns
        -------
        self : object
            Returns self.
        """
        return self.set_estimators("all")

    def _predict_scores(self, X):
        """Estimates confidence scores with the currently active estimators.

        Returns
        -------
        y_score : np.ndarray, shape = (n_samples, n_classes) or n_samples
            Confidence scores (probabilities or decision scores, depending on
            `self.response_method`). Wrapped with the ensemble mask into an
            ``earray`` if ``self.return_earray`` is True.
        """
        # NOTE(review): ``len`` / ``np.take`` below do not handle scipy sparse
        # input although the public methods' validation lists it -- confirm.
        n_samples = len(X)
        n_estimators = len(self._current_estimators)
        n_classes = self._current_estimators[0].classes_.shape[0]
        uses_proba = self.response_method == "predict_proba"

        # Scores to return
        if uses_proba:
            y_score = np.zeros((n_samples, n_classes), dtype=np.float64)
        else:
            y_score = np.zeros(n_samples, dtype=np.float64)

        # Ensemble mask if `self.return_earray` is True
        if self.return_earray:
            ensemble_mask = np.zeros((n_samples, n_estimators), dtype=np.bool_)

        # The current sample indices to process
        remaining_idx = np.arange(n_samples)

        for i, (estimator, threshold) in enumerate(
            zip(self._current_estimators, self._current_thresholds)
        ):
            # All samples are processed
            if len(remaining_idx) == 0:
                break

            # Predict currently deferred samples
            X_remaining = np.take(X, remaining_idx, axis=0)
            y_score_remaining = getattr(estimator, self.response_method)(X_remaining)
            if uses_proba:
                max_score = np.max(y_score_remaining, axis=1)
            else:
                max_score = y_score_remaining

            # The last active estimator accepts everything that reached it
            if i < n_estimators - 1:
                confident_mask = max_score >= threshold
            else:
                confident_mask = np.ones(len(remaining_idx), dtype=np.bool_)

            # Store accepted scores and keep only deferred samples
            confident_idx = remaining_idx[confident_mask]
            y_score[confident_idx] = y_score_remaining[confident_mask]
            remaining_idx = remaining_idx[~confident_mask]

            if self.return_earray:
                ensemble_mask[confident_idx, i] = True

        if self.return_earray:
            return earray(y_score, ensemble_mask)
        return y_score
# Number of candidate thresholds generated when ``cv_thresholds`` is not given.
_N_CV_THRESHOLDS = 10
# Upper end of the default, linearly spaced grid of candidate thresholds.
_MAX_DEFAULT_CV_THRESHOLD = 0.95
def _fitting_path(estimators, response_method, X, y, sample_weight):
    """Builds a cascade from clones of ``estimators`` and fits it via ``fit_one``.

    The cascade is created with a zero global threshold, earray outputs, no
    prefit check and sequential, silent training of its base estimators.

    Returns
    -------
    object
        Whatever ``fit_one`` returns (presumably the fitted cascade -- callers
        use the result as one).
    """
    fold_cascade = ThresholdCascadeClassifier(
        clone(estimators),
        thresholds=0.0,
        response_method=response_method,
        return_earray=True,
        prefit=False,
        n_jobs=None,
        verbose=False,
    )
    return fit_one(fold_cascade, X, y, sample_weight=sample_weight)
def _scoring_path(
    cascade,
    thresholds,
    costs,
    X,
    y,
    sample_weight,
    scoring,
    scoring_response_method,
):
    """Scores a fitted cascade under ``thresholds`` and computes its cost.

    Parameters
    ----------
    cascade : object
        Fitted cascade; its thresholds are updated in place via ``set_params``.
    thresholds : array-like of float
        Deferral thresholds to evaluate.
    costs : ndarray of shape (n_estimators,)
        Cost of each base estimator.
    X, y : array-like
        Evaluation samples and true targets.
    sample_weight : array-like or None
        Sample weights forwarded to the score function.
    scoring : object
        Scorer exposing ``_score_func`` and, optionally, ``_sign`` and ``_kwargs``
        (as scikit-learn scorers do).
    scoring_response_method : str
        Name of the cascade method producing predictions for the scorer.

    Returns
    -------
    score : float
        Signed score (higher is better for scikit-learn scorers).
    cost : float
        Dot product of the predictions' ``acceptance_rates`` and ``costs``.
    """
    predictor = getattr(
        cascade.set_params(thresholds=thresholds),
        scoring_response_method,
    )
    y_pred = predictor(X)

    # Honour the scorer's configuration: extra metric arguments (e.g.
    # ``average="macro"``) and the sign that makes "higher is better" hold.
    sign = getattr(scoring, "_sign", 1)
    score_kwargs = getattr(scoring, "_kwargs", {})

    def evaluate(predictions):
        return sign * scoring._score_func(
            y, predictions, sample_weight=sample_weight, **score_kwargs
        )

    try:
        # NOTE: Some scorers accept the full (n_samples, n_classes) predictions, but
        score = evaluate(y_pred)
    except ValueError:
        # others (e.g., `roc_auc_score` for binary classification) only n_samples.
        if np.ndim(y_pred) != 2:
            # Nothing to slice: surface the scorer's own error.
            raise
        score = evaluate(y_pred[:, 1])

    cost = y_pred.acceptance_rates @ costs
    return score, cost
class ThresholdCascadeClassifierCV(ThresholdCascadeClassifier):
    """Cascade of classifiers with Pareto-optimized deferral thresholds.

    Optimizes deferral thresholds via cross-validation grid search, identifying
    non-dominated (Pareto-optimal) threshold configurations that balance performance
    and computational cost. Users can select thresholds based on performance
    constraints or cost budgets (e.g., select the best threshold configuration s.t.
    it gives at least `min_score` classification score on validation).

    During inference, runs the first estimator and if a predicted score is lower
    than ``thresholds[0]``, tries the second, and so on. The last estimator always
    makes predictions on deferred samples.

    Parameters
    ----------
    estimators : array-like of object, length n_estimators
        Base estimators. Preferably ordered from weakest (fast, low-accuracy) to
        strongest (slow, high-accuracy).
    costs : array-like of shape (n_estimators,) or float, default=None
        Computational cost per estimator. Used to identify non-dominated
        threshold configurations along the cost-performance tradeoff. A single
        number is used for every estimator. Defaults to uniform costs summing
        to 1.0.
    cv_thresholds : array-like of shape (n_thresholds,) or int, default=None
        Candidate deferral thresholds for grid search. If None, defaults to 10
        thresholds linearly spaced from 1/n_classes to 0.95. If int, generates
        that many thresholds in the same range.
    min_score : float, default=None
        Minimum acceptable mean cross-validation score. Configurations scoring
        below it are infeasible. If None, no score constraint is applied.
    max_cost : float, default=None
        Maximum acceptable mean cross-validation cost. Configurations costing
        more are infeasible. If None, no cost constraint is applied.
    strategy : {"min_score", "max_cost", "balanced"}, default="balanced"
        How to pick one configuration among the feasible Pareto-optimal ones:

        - "min_score": the one with the highest mean score;
        - "max_cost": the one with the lowest mean cost;
        - "balanced": the one with the highest score-to-cost ratio.
    cv : int, cross-validation generator or iterable, default=5
        Cross-validation splitting strategy. Accepts:

        - int: number of folds (uses StratifiedKFold for classification)
        - CV splitter object
        - Iterable yielding (train_idx, test_idx) splits
    scoring : str or scorer object, default="accuracy"
        Scorer for threshold evaluation. Either a scikit-learn scorer name
        (e.g., "accuracy", "f1") or a scorer created with
        ``sklearn.metrics.make_scorer``; other callables raise ``ValueError``
        during ``fit``.
    raise_error : bool, default=False
        Whether to raise ``CascadeParetoConfigException`` if no configuration
        satisfies the specified constraints (min_score, max_cost). If False
        (default), issues a warning and falls back to the configuration with the
        highest score-to-cost ratio among all configurations.
    response_method : {"predict_proba", "decision_function"}, default="predict_proba"
        Method by estimators for computing deferral scores.
    return_earray : bool, default=True
        Whether to return :class:`~skfb.core.ENDArray` with ensemble mask
        or plain numpy ndarray.
    n_jobs : int, default=None
        Parallel jobs:

        - 1) model pre-training for each CV fold;
        - 2) score and cost evaluation for each fold and threshold configuration;
        - 3) and retraining on full data.

        -1 uses all processors. Defaults to one.
    verbose : int, default=0
        Verbosity level for each stage of training.

    Attributes
    ----------
    best_thresholds_ : ndarray of shape (n_estimators - 1,)
        Best selected thresholds.
    all_cv_thresholds_ : ndarray, shape (n_configs, n_estimators - 1)
        All generated threshold configurations.
    mean_cv_scores_ : ndarray, shape (n_configs,)
        Average cross-validated classification scores.
    mean_cv_costs_ : ndarray, shape (n_configs,)
        Average cross-validated computational costs.

    Examples
    --------
    >>> from skfb.ensemble import ThresholdCascadeClassifierCV
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.linear_model import LogisticRegression
    >>> X, y = make_classification(
    ...     n_samples=300, n_features=100, n_redundant=95, class_sep=0.1,
    ...     random_state=0)
    >>> cascading = ThresholdCascadeClassifierCV(
    ...     [LogisticRegression(l1_ratio=1.0, solver="liblinear", random_state=0),
    ...      RandomForestClassifier(random_state=0)],
    ...     costs=[1.0, 5.0],
    ...     cv_thresholds=5,
    ...     cv=3).fit(X, y)
    >>> cascading.best_thresholds_  # doctest: +SKIP

    Notes
    -----
    The Pareto front contains all non-dominated feasible configurations: those
    for which no other feasible configuration is at least as good in both score
    and cost and strictly better in at least one of them.

    If you want a fallback option for the last estimator, consider rejectors
    from :mod:`skfb.estimators`.
    """

    _parameter_constraints = {
        "estimators": ["array-like"],
        "costs": ["array-like", Interval(Real, 0, None, closed="neither"), None],
        "cv_thresholds": [
            "array-like",
            Interval(Real, None, None, closed="neither"),
            None,
        ],
        "cv": ["cv_object"],
        "scoring": [callable, StrOptions(set(get_scorer_names())), None],
        "min_score": [Interval(Real, None, None, closed="neither"), None],
        "max_cost": [Interval(Real, 0, None, closed="left"), None],
        "strategy": [StrOptions({"min_score", "max_cost", "balanced"})],
        "raise_error": ["boolean"],
        "response_method": [StrOptions({"decision_function", "predict_proba"})],
        "return_earray": ["boolean"],
        "n_jobs": [Interval(Integral, -1, None, closed="left"), None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        estimators,
        costs=None,
        cv_thresholds=None,
        min_score=None,
        max_cost=None,
        strategy="balanced",
        cv=5,
        scoring="accuracy",
        raise_error=False,
        response_method="predict_proba",
        return_earray=True,
        n_jobs=None,
        verbose=0,
    ):
        super().__init__(
            estimators=estimators,
            thresholds=0.0,
            response_method=response_method,
            return_earray=return_earray,
            prefit=False,
            n_jobs=n_jobs,
            verbose=verbose,
        )
        self.costs = costs
        self.cv_thresholds = cv_thresholds
        self.cv = cv
        self.scoring = scoring
        self.min_score = min_score
        self.max_cost = max_cost
        self.strategy = strategy
        self.raise_error = raise_error

    def _select_best_thresholds(self):
        """Selects ``best_thresholds_`` among cross-validated configurations.

        Configurations violating ``min_score`` / ``max_cost`` are discarded, the
        Pareto front of the remaining ones is computed, and ``strategy`` picks one
        configuration from the front. A configuration is dominated if another
        feasible one is at least as good in both score and cost and strictly
        better in at least one.

        Raises
        ------
        CascadeParetoConfigException
            If no configuration is feasible and ``raise_error`` is True.
        """
        # region Mask thresholds by constraints
        feasible = np.ones(len(self.all_cv_thresholds_), dtype=bool)
        if self.min_score is not None:
            feasible &= self.mean_cv_scores_ >= self.min_score
        if self.max_cost is not None:
            feasible &= self.mean_cv_costs_ <= self.max_cost
        # endregion

        # region Handle default case
        if not np.any(feasible):
            if self.raise_error:
                raise CascadeParetoConfigException(
                    f"No threshold configuration satisfies constraints: "
                    f"min_score={self.min_score} and max_cost={self.max_cost}."
                )
            default_idx = np.argmax(self.mean_cv_scores_ / self.mean_cv_costs_)
            self.best_thresholds_ = self.all_cv_thresholds_[default_idx]
            warnings.warn(
                (
                    f"No threshold configuration satisfies constraints: "
                    f"min_score={self.min_score} and max_cost={self.max_cost}; "
                    f"setting thresholds = {self.best_thresholds_} giving "
                    f"max(scores / costs)."
                ),
                category=CascadeParetoConfigWarning,
            )
            return
        # endregion

        # region Handle constraints
        feasible_idx = np.where(feasible)[0]
        feasible_scores = self.mean_cv_scores_[feasible_idx]
        feasible_costs = self.mean_cv_costs_[feasible_idx]

        pareto = np.ones(len(feasible_idx), dtype=bool)
        for i, (score_i, cost_i) in enumerate(zip(feasible_scores, feasible_costs)):
            # The inequality term excludes ``i`` itself and exact duplicates.
            dominators = (
                (feasible_scores >= score_i)
                & (feasible_costs <= cost_i)
                & ((feasible_scores != score_i) | (feasible_costs != cost_i))
            )
            pareto[i] = not dominators.any()

        pareto_idx = feasible_idx[pareto]
        pareto_scores = self.mean_cv_scores_[pareto_idx]
        pareto_costs = self.mean_cv_costs_[pareto_idx]

        # NOTE(review): "min_score" maximizes the score and "max_cost" minimizes
        # the cost -- confirm this naming is the intended one.
        if self.strategy == "min_score":
            best_idx = np.argmax(pareto_scores)
        elif self.strategy == "max_cost":
            best_idx = np.argmin(pareto_costs)
        else:
            best_idx = np.argmax(pareto_scores / pareto_costs)

        self.best_thresholds_ = self.all_cv_thresholds_[pareto_idx[best_idx]]
        # endregion

    @_fit_context(prefer_skip_nested_validation=False)
    @validate_params(
        {
            "X": ["array-like", "sparse matrix"],
            "y": ["array-like"],
            "sample_weight": ["array-like", None],
        },
        prefer_skip_nested_validation=True,
    )
    def fit(self, X, y, sample_weight=None):
        """Fit estimators and identify Pareto-optimal threshold configurations.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            Target class labels.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, samples are equally weighted.

        Returns
        -------
        self : object
            Fitted estimator. Use ``predict()`` and/or ``set_params()`` methods.

        Raises
        ------
        ValueError
            If ``scoring`` does not resolve to a scorer with a response method.
        """
        self.classes_ = unique_labels(y)
        n_estimators = len(self.estimators)

        # region Process costs
        if self.costs is None:
            self.costs_ = np.array([1.0 / n_estimators] * n_estimators)
        elif isinstance(self.costs, (float, int)):
            self.costs_ = np.array([self.costs] * n_estimators)
        else:
            self.costs_ = np.asarray(self.costs)
        # endregion

        # region Generate candidate thresholds
        if self.cv_thresholds is None or isinstance(
            self.cv_thresholds, (int, np.integer)
        ):
            n_thresholds = self.cv_thresholds or _N_CV_THRESHOLDS
            self.cv_thresholds_ = np.linspace(
                1 / len(self.classes_),
                _MAX_DEFAULT_CV_THRESHOLD,
                n_thresholds,
            )
        else:
            self.cv_thresholds_ = np.asarray(self.cv_thresholds)
        # endregion

        # region Setup cross-validation and scorer
        self.cv_ = check_cv(self.cv, y=y, classifier=True)
        self.scoring_ = get_scorer(self.scoring)
        try:
            response_methods = self.scoring_._response_method
        except AttributeError as err:
            raise ValueError(
                f"`scoring` should be either scikit-learn scorer name like 'accuracy' "
                f"or custom scorer wrapped with `sklearn.metrics.make_scorer`, "
                f"not {self.scoring}."
            ) from err
        if isinstance(response_methods, (list, tuple)):
            self._scoring_response_method = response_methods[-1]
        else:
            self._scoring_response_method = response_methods
        # endregion

        # region Generate all threshold combinations
        threshold_grids = [self.cv_thresholds_] * (n_estimators - 1)
        threshold_combinations = ParameterGrid(
            {
                f"threshold_{i}": thresholds
                for i, thresholds in enumerate(threshold_grids)
            },
        )
        self.all_cv_thresholds_ = np.array(
            [
                tuple(combo[f"threshold_{i}"] for i in range(n_estimators - 1))
                for combo in threshold_combinations
            ]
        )
        # endregion

        # Materialize the folds once: a shuffling splitter without a fixed
        # random state yields different folds on every ``split`` call, which
        # would evaluate cascades on samples they were trained on.
        splits = list(self.cv_.split(X, y))

        # region Temporarily train estimators on different folds
        cascades = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_fitting_path)(
                self.estimators,
                self.response_method,
                np.take(X, train_idx, axis=0),
                np.take(y, train_idx, axis=0),
                (
                    np.take(sample_weight, train_idx, axis=0)
                    if sample_weight is not None
                    else None
                ),
            )
            for train_idx, _ in splits
        )
        # endregion

        # region Cross-validation
        results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_scoring_path)(
                cascade,
                thresholds,
                self.costs_,
                np.take(X, test_idx, axis=0),
                np.take(y, test_idx, axis=0),
                (
                    np.take(sample_weight, test_idx, axis=0)
                    if sample_weight is not None
                    else None
                ),
                self.scoring_,
                self._scoring_response_method,
            )
            for thresholds in self.all_cv_thresholds_
            for cascade, (_, test_idx) in zip(cascades, splits)
        )
        # Results are ordered by configuration first, then by fold.
        scores_and_costs = np.array(results).reshape(
            len(self.all_cv_thresholds_), len(cascades), 2
        )
        self.mean_cv_scores_ = scores_and_costs[:, :, 0].mean(axis=1)
        self.mean_cv_costs_ = scores_and_costs[:, :, 1].mean(axis=1)
        # endregion

        # region Set default thresholds and estimators
        self._select_best_thresholds()
        self._set_thresholds(self.best_thresholds_)
        # Drop estimators from a previous ``fit`` so that the final models are
        # always retrained on the data passed to this call.
        vars(self).pop("estimators_", None)
        super().fit(X, y, sample_weight=sample_weight)
        # endregion

        return self

    def set_params(self, **params):
        """Sets the parameters of the cascade.

        If thresholds or new constraints are provided, the transformations are done
        accordingly, so there is no need to refit the cascade. Constraints passed
        before ``fit`` are only stored; thresholds are selected during ``fit``.

        Parameters
        ----------
        **params : dict
            Cascade parameters.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If all `min_score`, `max_cost`, and `thresholds` are passed.
        """
        # Key membership instead of value comparison: comparing array-valued
        # thresholds with a sentinel is ambiguous.
        has_thresholds = "thresholds" in params
        has_max_cost = "max_cost" in params
        has_min_score = "min_score" in params

        if has_thresholds and has_max_cost and has_min_score:
            raise ValueError(
                "Pass either min_score and max_cost or thresholds. "
                "The former will automatically determine the best thresholds."
            )

        if has_thresholds:
            # ``thresholds`` is not a constructor parameter of this class, so it
            # must not reach the generic parameter setter.
            thresholds = params.pop("thresholds")
            super().set_params(**params)
            self._set_thresholds(thresholds)
            return self

        super().set_params(**params)
        if (has_max_cost or has_min_score) and hasattr(self, "mean_cv_scores_"):
            self._select_best_thresholds()
            self._set_thresholds(self.best_thresholds_)
        return self