# -*- coding: utf-8 -*-
"""Combining multiple clusterings using evidence accumulation (EAC).
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause
import warnings
import numpy as np
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from pyod.utils.utility import check_parameter
from .base import BaseAggregator
def _generate_similarity_mat(labels):
"""Internal function to generate similarity matrix.
Parameters
----------
labels : numpy array of shape (n_samples, 1)
Returns
-------
sim_mat : numpy array of shape (n_samples, n_samples)
Similarity matrix. If label_i == label_j, sim_mat[i,j] = 1, else 0.
"""
l_mat = np.repeat(labels, len(labels), axis=1)
l_mat_t = l_mat.T
sim_mat = np.equal(l_mat, l_mat_t).astype(int)
return sim_mat
[docs]class EAC(BaseAggregator):
"""Combining multiple clusterings using evidence accumulation (EAC) first
builds similarity matrix for each base clustering to model the similarity
among the cluster assignment among each sample. After the similarity
matrices are aggregated, a hierarchical clustering is built on it. See
:cite:`fred2005combining` for details.
Parameters
----------
base_estimators : list or numpy array of shape (n_estimators,)
A list of base estimators. Estimators must have a `labels_`
attribute once fitted. Sklearn clustering estimators are recommended.
n_clusters : int, optional (default=8)
The number of clusters.
linkage_method : str, optional (default='single')
The linkage method to use (single, complete, average,
weighted, median centroid, ward). See
https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
for more information.
weights : numpy array of shape (n_estimators,)
Estimator weights. May be used after the alignment.
pre_fitted : bool, optional (default=False)
Whether the base estimators are trained. If True, `fit`
process may be skipped.
Attributes
----------
labels_ : int
The predicted label of the fitted data.
Z_ : numpy array
The linkage matrix encoding the hierarchical clustering. This can be
used to plot dendrogram using scipy.
"""
def __init__(self, base_estimators, n_clusters, linkage_method='single',
weights=None, pre_fitted=False):
super(EAC, self).__init__(
base_estimators=base_estimators, pre_fitted=pre_fitted)
check_parameter(n_clusters, low=2, param_name='n_clusters')
self.n_clusters = n_clusters
# set estimator weights
self._set_weights(weights)
self.linkage_method = linkage_method
[docs] def fit(self, X):
"""Fit estimators.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
"""
# Validate inputs X
X = check_array(X)
n_samples = X.shape[0]
# initialize similarity matrix
sim_mat_all = np.zeros([n_samples, n_samples])
if self.pre_fitted:
print("Training Skipped")
else:
for clf in self.base_estimators:
clf.fit(X)
clf.fitted_ = True
for i, estimator in enumerate(self.base_estimators):
check_is_fitted(estimator, ['labels_'])
# get the labels from each base estimator
labels = estimator.labels_.reshape(n_samples, 1)
# generate the similarity matrix for the current estimator
sim_mat = _generate_similarity_mat(labels)
# add to the main similarity mat
sim_mat_all = sim_mat_all + sim_mat
# get the average of the similarity mat
sim_mat_avg = np.divide(sim_mat_all, self.n_base_estimators_)
# flip the similarity. smaller value implies more similarity
sim_mat_avg = np.abs(np.max(sim_mat_avg) - sim_mat_avg)
# build clusters
self.Z_ = linkage(sim_mat_avg, method=self.linkage_method)
self.labels_ = fcluster(self.Z_, self.n_clusters, criterion='maxclust')
# it may leads to different number of clusters as specified by the user
if len(np.unique(self.labels_)) != self.n_clusters:
warnings.warn(
'EAC generates {n} clusters instead of {n_clusters}'.format(
n=len(np.unique(self.labels_)),
n_clusters=self.n_clusters))
return self
[docs] def predict(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
labels : numpy array of shape (n_samples,)
Class labels for each data sample.
"""
# TODO: decide whether enable predict function for clustering
raise NotImplemented("predict function is currently disabled for"
"clustering due to inconsistent behaviours.")
[docs] def predict_proba(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
labels : numpy array of shape (n_samples,)
Class labels for each data sample.
"""
raise NotImplemented("predict_proba function is currently disabled for"
"clustering due to inconsistent behaviours.")
[docs] def fit_predict(self, X, y=None):
"""Fit estimator and predict on X. y is optional for unsupervised
methods.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
y : numpy array of shape (n_samples,), optional (default=None)
The ground truth of the input samples (labels).
Returns
-------
labels : numpy array of shape (n_samples,)
Cluster labels for each data sample.
"""
self.fit(X)
return self.labels_