Source code for combo.models.classifier_dcs

# -*- coding: utf-8 -*-
"""Stacking (meta ensembling). See http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
for more information.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

import warnings
import numpy as np

from sklearn.neighbors import KDTree
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets

from pyod.utils.utility import check_parameter

from .base import BaseAggregator


class DCS_LA(BaseAggregator):
    """Dynamic Classifier Selection (DCS) is an established combination
    framework for classification tasks. The technique was first proposed by
    Ho et al. in 1994 :cite:`ho1994decision` and then extended, under the
    name DCS Local Accuracy, by Woods et al. in 1997
    :cite:`woods1997combination` to select the most accurate base classifier
    in a local region. The motivation behind this approach is that base
    classifiers often make distinct errors and offer a degree of
    complementarity. Consequently, selectively combining base classifiers
    can result in a performance improvement over generic ensembles which
    use the majority vote of all base classifiers.
    See :cite:`woods1997combination` for details.

    Parameters
    ----------
    base_estimators : list or numpy array of shape (n_estimators,)
        A list of base classifiers.

    local_region_size : int, optional (default=30)
        Number of nearest training points used to define the local region
        around each test sample.

    threshold : float in (0, 1), optional (default=None)
        Cut-off value to convert scores into binary labels.

    pre_fitted : bool, optional (default=None)
        Whether the base classifiers are already trained. If True, the `fit`
        process may be skipped.
    """

    def __init__(self, base_estimators, local_region_size=30, threshold=None,
                 pre_fitted=None):

        super(DCS_LA, self).__init__(
            base_estimators=base_estimators, pre_fitted=pre_fitted)

        # validate input parameters
        if not isinstance(local_region_size, int):
            raise ValueError('local_region_size must be an integer variable')
        check_parameter(local_region_size, low=2, include_left=True,
                        param_name='local_region_size')
        self.local_region_size = local_region_size

        if threshold is not None:
            warnings.warn(
                "DCS does not support threshold setting option. "
                "Please set the threshold in classifiers directly.")

        if pre_fitted is not None:
            warnings.warn("DCS does not support pre_fitted option.")
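A minimal usage sketch. The scikit-learn dataset and base estimators below
are hypothetical stand-ins, not part of this module; any classifiers with a
fit/predict/predict_proba API should work:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = DCS_LA(base_estimators=[DecisionTreeClassifier(random_state=42),
                              KNeighborsClassifier(),
                              GaussianNB()])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # locally best classifier per test sample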
    def fit(self, X, y):
        """Fit classifier.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth of the input samples (labels).
        """

        # Validate inputs X and y
        X, y = check_X_y(X, y)
        X = check_array(X)
        check_classification_targets(y)
        self._classes = len(np.unique(y))
        n_samples = X.shape[0]

        # save the training ground truth for evaluation purpose
        self.y_train_ = y

        # build a KDTree over the training set to query local regions
        self.tree_ = KDTree(X)

        self.y_train_predicted_ = np.zeros(
            [n_samples, self.n_base_estimators_])

        # train all base classifiers on X and record their training-set
        # predictions for local accuracy estimation at prediction time
        for i, clf in enumerate(self.base_estimators):
            clf.fit(X, y)
            self.y_train_predicted_[:, i] = clf.predict(X)
            clf.fitted_ = True

        self.fitted_ = True
        return self
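The fitted KDTree is what later defines each test sample's local region. A
standalone sketch of the same query pattern, on made-up random data:

import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.RandomState(0)
X_train = rng.rand(100, 2)
tree = KDTree(X_train)

# indices of the 5 nearest training points to one query point,
# analogous to the local_region_size neighbors used by DCS_LA
_, ind = tree.query(rng.rand(1, 2), k=5)
print(ind.shape)  # (1, 5): one row of training-set indices per query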
    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        return self._predict_internal(X, predict_proba=False)
    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : numpy array of shape (n_samples, n_classes)
            The class probabilities of the input samples.
            Classes are ordered by lexicographic order.
        """
        return self._predict_internal(X, predict_proba=True)
    def _predict_internal(self, X, predict_proba):
        """Internal function shared by predict and predict_proba.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        predict_proba : bool
            If True, return the result of predict_proba.

        Returns
        -------
        y_predicted : numpy array of shape (n_samples,) or
            (n_samples, n_classes)
            Predicted labels, or class probabilities if predict_proba
            is True.
        """
        check_is_fitted(self, ['fitted_'])
        X = check_array(X)
        n_samples = X.shape[0]

        # find the local region (nearest training points) of every
        # test instance
        _, ind_arr = self.tree_.query(X, k=self.local_region_size)

        if predict_proba:
            y_predicted = np.zeros([n_samples, self._classes])
        else:
            y_predicted = np.zeros([n_samples, ])

        # For each test sample
        for i in range(n_samples):
            test_sample = X[i, :].reshape(1, -1)
            train_inds = ind_arr[i, :]

            # ground truth of the local region
            y_train_sample = self.y_train_[train_inds]
            clf_performance = np.zeros([self.n_base_estimators_, ])

            # local accuracy of each base classifier on the region
            for j, clf in enumerate(self.base_estimators):
                y_train_clf = self.y_train_predicted_[train_inds, j]
                clf_performance[j] = accuracy_score(y_train_sample,
                                                    y_train_clf)

            # select the best clf; ties may yield multiple candidates
            select_clf_inds = np.argwhere(
                clf_performance == np.amax(clf_performance)).ravel()

            # break ties by taking the last candidate
            best_clf_ind = select_clf_inds[-1]

            # make the prediction with the locally best classifier
            if predict_proba:
                y_predicted[i] = self.base_estimators[
                    best_clf_ind].predict_proba(test_sample)
            else:
                y_predicted[i] = self.base_estimators[
                    best_clf_ind].predict(test_sample)

        return y_predicted
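To make the selection rule concrete, a small hand-worked sketch of one
iteration of the loop above, with made-up neighbor labels and stored
base-classifier predictions:

import numpy as np
from sklearn.metrics import accuracy_score

# hypothetical local region of 5 neighbors: their ground-truth labels and
# the training-time predictions of 3 base classifiers on those points
y_train_sample = np.array([1, 0, 1, 1, 0])
y_train_predicted = np.array([[1, 0, 1, 1, 0],     # clf 0: 5/5 correct
                              [1, 1, 1, 1, 0],     # clf 1: 4/5 correct
                              [0, 0, 0, 1, 0]]).T  # clf 2: 3/5 correct

clf_performance = np.array(
    [accuracy_score(y_train_sample, y_train_predicted[:, j])
     for j in range(y_train_predicted.shape[1])])

# ties are broken by taking the last of the top scorers, as in the code above
best_clf_ind = np.argwhere(
    clf_performance == np.amax(clf_performance)).ravel()[-1]
print(best_clf_ind)  # 0 -> clf 0 handles this test sample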
    def fit_predict(self, X, y):
        """Fit estimator and predict on X. Not implemented: DCS_LA is a
        supervised model, so fit and predict should be called separately
        on training and test data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth of the input samples (labels).

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        raise NotImplementedError(
            'fit_predict should not be used in supervised learning models.')