# -*- coding: utf-8 -*-
"""A collection of methods for combining classifiers
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause
import warnings

import numpy as np
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils import column_or_1d
from pyod.utils.utility import check_parameter

from .base import BaseAggregator
from .score_comb import average, maximization, majority_vote, median
from ..utils.utility import score_to_proba
class SimpleClassifierAggregator(BaseAggregator):
    """A collection of simple classifier combination methods.

    Parameters
    ----------
    base_estimators : list or numpy array (n_estimators,)
        A list of base classifiers.

    method : str, optional (default='average')
        Combination method: {'average', 'maximization', 'majority_vote',
        'median'}. Pass in weights of classifier for weighted version.

    threshold : float in (0, 1), optional (default=0.5)
        Cut-off value to convert scores into binary labels.

    weights : numpy array of shape (1, n_classifiers), optional (default=None)
        Classifier weights.

    pre_fitted : bool, optional (default=False)
        Whether the base classifiers are trained. If True, `fit`
        process may be skipped.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported combination methods.
    """

    def __init__(self, base_estimators, method='average', threshold=0.5,
                 weights=None, pre_fitted=False):
        super(SimpleClassifierAggregator, self).__init__(
            base_estimators=base_estimators, pre_fitted=pre_fitted)

        # validate input parameters
        if method not in ('average', 'maximization', 'majority_vote',
                          'median'):
            raise ValueError("{method} is not a valid parameter.".format(
                method=method))
        self.method = method

        # threshold must lie strictly between 0 and 1 (both ends excluded)
        check_parameter(threshold, 0, 1, include_left=False,
                        include_right=False, param_name='threshold')
        self.threshold = threshold

        # set estimator weights (helper is not defined in this class;
        # it is expected to come from the base class)
        self._set_weights(weights)

    def _check_estimator_fitted(self, clf):
        """Raise if ``clf`` has not been fitted and is not declared pre-fitted.

        ``getattr`` with a default is used because an estimator that was
        never passed through `fit` of this aggregator has no ``fitted_``
        attribute at all.

        Raises
        ------
        ValueError
            If ``pre_fitted`` is False and ``clf.fitted_`` is not True.
        """
        if not self.pre_fitted and getattr(clf, 'fitted_', False) is not True:
            raise ValueError('Classifier should be fitted first!')

    def fit(self, X, y):
        """Fit classifier.

        Every base estimator is fitted on ``(X, y)`` and flagged with
        ``fitted_ = True``. When ``pre_fitted`` is True, fitting of the base
        estimators is skipped.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth of the input samples (labels).
        """
        # Validate inputs X and y (check_X_y already validates X, so no
        # separate check_array call is needed)
        X, y = check_X_y(X, y)
        self._set_n_classes(y)

        if self.pre_fitted:
            print("Training skipped")
            return

        for clf in self.base_estimators:
            clf.fit(X, y)
            clf.fitted_ = True
        return

    def predict(self, X):
        """Predict the class labels for the provided data.

        The predictions of all base estimators are combined with the
        configured ``method`` and then binarized with ``threshold``.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.

        Raises
        ------
        ValueError
            If a base estimator is not fitted, or has no ``predict`` method.
        """
        X = check_array(X)

        # one column of predictions per base estimator
        all_scores = np.zeros([X.shape[0], self.n_base_estimators_])

        for i, clf in enumerate(self.base_estimators):
            self._check_estimator_fitted(clf)
            if not hasattr(clf, 'predict'):
                raise ValueError(
                    "{clf} does not have predict.".format(clf=clf))
            all_scores[:, i] = clf.predict(X)

        if self.method == 'average':
            agg_score = average(all_scores, estimator_weights=self.weights)
        elif self.method == 'maximization':
            agg_score = maximization(all_scores)
        elif self.method == 'majority_vote':
            agg_score = majority_vote(all_scores, weights=self.weights)
        elif self.method == 'median':
            agg_score = median(all_scores)
        else:
            # only reachable if ``method`` was modified after construction
            raise ValueError("{method} is not a valid parameter.".format(
                method=self.method))

        return (agg_score >= self.threshold).astype('int').ravel()

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        For 'majority_vote' and 'median' the weighted average is used
        instead, and a warning is issued.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : numpy array of shape (n_samples, n_classes)
            The class probabilities of the input samples.
            Classes are ordered by lexicographic order.

        Raises
        ------
        ValueError
            If a base estimator is not fitted, or has no ``predict_proba``
            method.
        """
        X = check_array(X)

        # (n_samples, n_classes, n_estimators): one slice per estimator
        all_scores = np.zeros(
            [X.shape[0], self._classes, self.n_base_estimators_])

        for i, clf in enumerate(self.base_estimators):
            self._check_estimator_fitted(clf)
            if not hasattr(clf, 'predict_proba'):
                raise ValueError(
                    "{clf} does not have predict_proba.".format(clf=clf))
            all_scores[:, :, i] = clf.predict_proba(X)

        # weights broadcast over the last (estimator) axis
        weighted_scores = all_scores * self.weights

        if self.method == 'maximization':
            scores = np.max(weighted_scores, axis=2)
            return score_to_proba(scores)

        if self.method in ('majority_vote', 'median'):
            warnings.warn('average method is invoked for predict_proba as '
                          'probability is not continuous')

        return np.mean(weighted_scores, axis=2)

    def fit_predict(self, X, y):
        """Fit estimator and predict on X.

        Not supported: this always raises.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth of the input samples (labels).

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError(
            'fit_predict should not be used in supervised learning models.')