# Source code for biosppy.biometrics

# -*- coding: utf-8 -*-
"""
biosppy.biometrics
------------------

This module provides classifier interfaces for identity recognition
(biometrics) applications. The core API methods are:
* enroll: add a new subject;
* dismiss: remove an existing subject;
* identify: determine the identity of collected biometric dataset;
* authenticate: verify the identity of collected biometric dataset.

:copyright: (c) 2015-2018 by Instituto de Telecomunicacoes
:license: BSD 3-clause, see LICENSE for more details.
"""

# Imports
# compat
from __future__ import absolute_import, division, print_function
from six.moves import range
import six

# built-in
import collections

# 3rd party
import numpy as np
import shortuuid
from bidict import bidict
from sklearn import model_selection as skcv
from sklearn import svm as sksvm

# local
from . import metrics, plotting, storage, utils
from .signals import tools


class SubjectError(Exception):
    """Exception raised when the subject is unknown.

    Parameters
    ----------
    subject : hashable, optional
        Identity of the subject that is not enrolled; stored in the
        `subject` attribute.

    """

    def __init__(self, subject=None):
        self.subject = subject

    def __str__(self):
        if self.subject is None:
            return "Subject is not enrolled."

        # wrap in a 1-tuple: a tuple identity would otherwise be unpacked as
        # several format arguments and raise a TypeError
        return "Subject %r is not enrolled." % (self.subject,)


class UntrainedError(Exception):
    """Exception raised when classifier is not trained."""

    def __str__(self):
        # fixed message; the exception carries no additional state
        return "The classifier is not trained."


class CombinationError(Exception):
    """Exception raised when the combination method fails."""

    def __str__(self):
        # fixed message; the exception carries no additional state
        return "Combination of empty array."


class BaseClassifier(object):
    """Base biometric classifier class.

    This class is a skeleton for actual classifier classes.
    The following methods must be overridden or adapted to build a
    new classifier:

    * __init__
    * _authenticate
    * _get_thresholds
    * _identify
    * _prepare
    * _train
    * _update

    Attributes
    ----------
    EER_IDX : int
        Reference index for the Equal Error Rate.

    """

    EER_IDX = 0

    def __init__(self):
        # generic self things
        self.is_trained = False
        self._subject2label = bidict()
        self._nbSubjects = 0
        self._thresholds = {}
        self._autoThresholds = None

        # init data storage
        self._iofile = {}

        # defer flag
        self._defer_flag = False
        self._reset_defer()

    def _reset_defer(self):
        """Reset defer buffer."""

        self._defer_dict = {'enroll': set(), 'dismiss': set()}

    def _defer(self, label, case):
        """Add deferred task.

        Parameters
        ----------
        label : str
            Internal classifier subject label.
        case : str
            One of 'enroll' or 'dismiss'.

        Notes
        -----
        * An enroll overrides a previous dismiss for the same subject.
        * A dismiss overrides a previous enroll for the same subject.
        * Any other `case` value records no task, but the defer flag is
          still raised.

        """

        if case == 'enroll':
            self._defer_dict['enroll'].add(label)
            self._defer_dict['dismiss'].discard(label)
        elif case == 'dismiss':
            self._defer_dict['dismiss'].add(label)
            self._defer_dict['enroll'].discard(label)

        self._defer_flag = True

    def _check_state(self):
        """Check and update the train state.

        The classifier counts as trained while at least one subject is
        enrolled.

        """

        self.is_trained = self._nbSubjects > 0

    def io_load(self, label):
        """Load enrolled subject data.

        Parameters
        ----------
        label : str
            Internal classifier subject label.

        Returns
        -------
        data : array
            Subject data.

        Raises
        ------
        KeyError
            If no data is stored under `label`.

        """

        return self._iofile[label]

    def io_save(self, label, data):
        """Save subject data.

        Parameters
        ----------
        label : str
            Internal classifier subject label.
        data : array
            Subject data.

        """

        self._iofile[label] = data

    def io_del(self, label):
        """Delete subject data.

        Parameters
        ----------
        label : str
            Internal classifier subject label.

        Raises
        ------
        KeyError
            If no data is stored under `label`.

        """

        del self._iofile[label]

    def save(self, path):
        """Save classifier instance to a file.

        Parameters
        ----------
        path : str
            Destination file path.

        """

        storage.serialize(self, path)

    @classmethod
    def load(cls, path):
        """Load classifier instance from a file.

        Parameters
        ----------
        path : str
            Source file path.

        Returns
        -------
        clf : object
            Loaded classifier instance.

        Raises
        ------
        TypeError
            If the loaded object is not an instance of the calling class.

        """

        # load classifier
        clf = storage.deserialize(path)

        # check class type
        if not isinstance(clf, cls):
            raise TypeError("Mismatch between target class and loaded file.")

        return clf

    def check_subject(self, subject):
        """Check if a subject is enrolled.

        Parameters
        ----------
        subject : hashable
            Subject identity.

        Returns
        -------
        check : bool
            If True, the subject is enrolled. Always False while the
            classifier is not trained.

        """

        if self.is_trained:
            return subject in self._subject2label

        return False

    def list_subjects(self):
        """List all the enrolled subjects.

        Returns
        -------
        subjects : list
            Enrolled subjects.

        """

        return list(self._subject2label)

    def enroll(self, data=None, subject=None, deferred=False):
        """Enroll new data for a subject.

        If the subject is already enrolled, new data is combined with
        existing data.

        Parameters
        ----------
        data : array
            Data to enroll.
        subject : hashable
            Subject identity.
        deferred : bool, optional
            If True, computations are delayed until `flush` is called.

        Raises
        ------
        TypeError
            If `data` or `subject` is not specified.

        Notes
        -----
        * When using deferred calls, an enroll overrides a previous dismiss
          for the same subject.

        """

        # check inputs
        if data is None:
            raise TypeError("Please specify the data to enroll.")

        if subject is None:
            raise TypeError("Please specify the subject identity.")

        # Look the subject up directly in the label map: `check_subject`
        # reports False until the first training pass, which would make a
        # repeated deferred enroll allocate a second label for the same
        # subject (orphaning its stored data and inflating the count).
        if subject in self._subject2label:
            # load existing
            label = self._subject2label[subject]
            old = self.io_load(label)

            # combine data
            data = self._update(old, data)
        else:
            # create new label
            label = shortuuid.uuid()
            self._subject2label[subject] = label
            self._nbSubjects += 1

        # store data
        self.io_save(label, data)

        if deferred:
            # delay computations
            self._defer(label, 'enroll')
        else:
            self._train([label], None)
            self._check_state()
            self.update_thresholds()

    def dismiss(self, subject=None, deferred=False):
        """Remove a subject.

        Parameters
        ----------
        subject : hashable
            Subject identity.
        deferred : bool, optional
            If True, computations are delayed until `flush` is called.

        Raises
        ------
        TypeError
            If `subject` is not specified.
        SubjectError
            If the subject to remove is not enrolled.

        Notes
        -----
        * When using deferred calls, a dismiss overrides a previous enroll
          for the same subject.

        """

        # check inputs
        if subject is None:
            raise TypeError("Please specify the subject identity.")

        # direct lookup so that a subject enrolled in deferred mode can be
        # dismissed before the first training pass
        if subject not in self._subject2label:
            raise SubjectError(subject)

        label = self._subject2label[subject]
        del self._subject2label[subject]
        # a subject enrolled in deferred mode has no thresholds yet
        self._thresholds.pop(label, None)
        self._nbSubjects -= 1
        self.io_del(label)

        if deferred:
            self._defer(label, 'dismiss')
        else:
            self._train(None, [label])
            self._check_state()
            # thresholds can only be estimated while subjects remain
            if self.is_trained:
                self.update_thresholds()

    def batch_train(self, data=None):
        """Train the classifier in batch mode.

        Parameters
        ----------
        data : dict
            Dictionary holding training data for each subject; if the object
            for a subject is `None`, performs a `dismiss`.

        Raises
        ------
        TypeError
            If `data` is not specified.

        """

        # check inputs
        if data is None:
            raise TypeError("Please specify the data to train.")

        for sub, val in six.iteritems(data):
            if val is None:
                try:
                    self.dismiss(sub, deferred=True)
                except SubjectError:
                    # nothing to remove for an unknown subject
                    continue
            else:
                self.enroll(val, sub, deferred=True)

        self.flush()

    def flush(self):
        """Flush deferred computations.

        Does nothing when no deferred task is pending.

        """

        if not self._defer_flag:
            return

        self._defer_flag = False

        # train
        enroll = list(self._defer_dict['enroll'])
        dismiss = list(self._defer_dict['dismiss'])
        self._train(enroll, dismiss)

        # update thresholds (only possible while subjects remain enrolled)
        self._check_state()
        if self.is_trained:
            self.update_thresholds()

        # reset
        self._reset_defer()

    def update_thresholds(self, fraction=1.):
        """Update subject-specific thresholds based on the enrolled data.

        Parameters
        ----------
        fraction : float, optional
            Fraction of samples to select from training data.

        Raises
        ------
        UntrainedError
            If the classifier is not trained (raised by `evaluate`).

        """

        ths = self.get_thresholds(force=True)

        # gather data to test
        data = {}
        for subject, label in six.iteritems(self._subject2label):
            # select a random fraction of the training data
            aux = self.io_load(label)
            indx = list(range(len(aux)))
            use, _ = utils.random_fraction(indx, fraction, sort=True)

            data[subject] = aux[use]

        # evaluate classifier
        _, res = self.evaluate(data, ths)

        # choose thresholds at EER
        for subject, label in six.iteritems(self._subject2label):
            EER_auth = res['subject'][subject]['authentication']['rates']['EER']
            self.set_auth_thr(label, EER_auth[self.EER_IDX, 0], ready=True)

            EER_id = res['subject'][subject]['identification']['rates']['EER']
            self.set_id_thr(label, EER_id[self.EER_IDX, 0], ready=True)

    def set_auth_thr(self, subject, threshold, ready=False):
        """Set the authentication threshold of a subject.

        Parameters
        ----------
        subject : hashable
            Subject identity.
        threshold : int, float
            Threshold value.
        ready : bool, optional
            If True, `subject` is the internal classifier label.

        Raises
        ------
        SubjectError
            If `ready` is False and the subject is not enrolled.

        """

        if not ready:
            if not self.check_subject(subject):
                raise SubjectError(subject)
            subject = self._subject2label[subject]

        try:
            self._thresholds[subject]['auth'] = threshold
        except KeyError:
            # first threshold stored for this label
            self._thresholds[subject] = {'auth': threshold, 'id': None}

    def get_auth_thr(self, subject, ready=False):
        """Get the authentication threshold of a subject.

        Parameters
        ----------
        subject : hashable
            Subject identity.
        ready : bool, optional
            If True, `subject` is the internal classifier label.

        Returns
        -------
        threshold : int, float
            Threshold value.

        Raises
        ------
        SubjectError
            If `ready` is False and the subject is not enrolled.
        KeyError
            If no thresholds have been stored for the subject.

        """

        if not ready:
            if not self.check_subject(subject):
                raise SubjectError(subject)
            subject = self._subject2label[subject]

        return self._thresholds[subject].get('auth', None)

    def set_id_thr(self, subject, threshold, ready=False):
        """Set the identification threshold of a subject.

        Parameters
        ----------
        subject : hashable
            Subject identity.
        threshold : int, float
            Threshold value.
        ready : bool, optional
            If True, `subject` is the internal classifier label.

        Raises
        ------
        SubjectError
            If `ready` is False and the subject is not enrolled.

        """

        if not ready:
            if not self.check_subject(subject):
                raise SubjectError(subject)
            subject = self._subject2label[subject]

        try:
            self._thresholds[subject]['id'] = threshold
        except KeyError:
            # first threshold stored for this label
            self._thresholds[subject] = {'auth': None, 'id': threshold}

    def get_id_thr(self, subject, ready=False):
        """Get the identification threshold of a subject.

        Parameters
        ----------
        subject : hashable
            Subject identity.
        ready : bool, optional
            If True, `subject` is the internal classifier label.

        Returns
        -------
        threshold : int, float
            Threshold value.

        Raises
        ------
        SubjectError
            If `ready` is False and the subject is not enrolled.
        KeyError
            If no thresholds have been stored for the subject.

        """

        if not ready:
            if not self.check_subject(subject):
                raise SubjectError(subject)
            subject = self._subject2label[subject]

        return self._thresholds[subject].get('id', None)

    def get_thresholds(self, force=False):
        """Get an array of reasonable thresholds.

        The generated array is cached and reused by later calls.

        Parameters
        ----------
        force : bool, optional
            If True, forces generation of thresholds.

        Returns
        -------
        ths : array
            Generated thresholds.

        """

        if force or (self._autoThresholds is None):
            self._autoThresholds = self._get_thresholds()

        return self._autoThresholds

    def authenticate(self, data, subject, threshold=None):
        """Authenticate a set of feature vectors, allegedly belonging to the
        given subject.

        Parameters
        ----------
        data : array
            Input test data.
        subject : hashable
            Subject identity.
        threshold : int, float, optional
            Authentication threshold; if not given, the threshold stored
            for the subject is used.

        Returns
        -------
        decision : array
            Authentication decision for each input sample.

        Raises
        ------
        UntrainedError
            If the classifier is not trained.
        SubjectError
            If the subject is not enrolled.

        """

        # check train state
        if not self.is_trained:
            raise UntrainedError

        # check subject
        if not self.check_subject(subject):
            raise SubjectError(subject)

        label = self._subject2label[subject]

        # check threshold
        if threshold is None:
            threshold = self.get_auth_thr(label, ready=True)

        # prepare data
        aux = self._prepare(data, targets=label)

        # authenticate
        decision = self._authenticate(aux, label, threshold)

        return decision

    def identify(self, data, threshold=None):
        """Identify a set of feature vectors.

        Parameters
        ----------
        data : array
            Input test data.
        threshold : int, float, optional
            Identification threshold.

        Returns
        -------
        subjects : list
            Identity of each input sample; an empty string marks a label
            that does not map to an enrolled subject.

        Raises
        ------
        UntrainedError
            If the classifier is not trained.

        """

        # check train state
        if not self.is_trained:
            raise UntrainedError

        # prepare data
        aux = self._prepare(data)

        # identify
        labels = self._identify(aux, threshold)

        # translate class labels
        subjects = [self._subject2label.inv.get(item, '') for item in labels]

        return subjects

    def evaluate(self, data, thresholds=None, path=None, show=False):
        """Assess the performance of the classifier in both authentication and
        identification scenarios.

        Parameters
        ----------
        data : dict
            Dictionary holding test data for each subject.
        thresholds : array, optional
            Classifier thresholds to use.
        path : str, optional
            If provided, the plot will be saved to the specified file.
        show : bool, optional
            If True, show a summary plot.

        Returns
        -------
        classification : dict
            Classification results.
        assessment : dict
            Biometric statistics.

        Raises
        ------
        UntrainedError
            If the classifier is not trained.
        ValueError
            If none of the subjects in `data` is enrolled.

        """

        # check train state
        if not self.is_trained:
            raise UntrainedError

        # check thresholds
        if thresholds is None:
            thresholds = self.get_thresholds()

        # get subjects
        subjects = [item for item in data if self.check_subject(item)]
        if len(subjects) == 0:
            raise ValueError("No enrolled subjects in test set.")

        results = {
            'subjectList': subjects,
            'subjectDict': self._subject2label,
        }

        for subject in subjects:
            # prepare data
            aux = self._prepare(data[subject])

            # test
            auth_res = []
            id_res = []
            for th in thresholds:
                # authentication: test the samples against every subject
                auth = []
                for subject_tst in subjects:
                    label = self._subject2label[subject_tst]
                    auth.append(self._authenticate(aux, label, th))

                auth_res.append(np.array(auth))

                # identification
                id_res.append(self._identify(aux, th))

            auth_res = np.array(auth_res)
            id_res = np.array(id_res)
            results[subject] = {'authentication': auth_res,
                                'identification': id_res,
                                }

        # assess classification results
        assess, = assess_classification(results, thresholds)

        # output
        args = (results, assess)
        names = ('classification', 'assessment')
        out = utils.ReturnTuple(args, names)

        if show:
            # plot
            plotting.plot_biometrics(assess,
                                     self.EER_IDX,
                                     path=path,
                                     show=True)

        return out

    @classmethod
    def cross_validation(cls, data, labels, cv, thresholds=None, **kwargs):
        """Perform Cross Validation (CV) on a data set.

        Parameters
        ----------
        data : array
            An m by n array of m data samples in an n-dimensional space.
        labels : list, array
            A list of m class labels.
        cv : CV iterator
            A `sklearn.model_selection` iterator.
        thresholds : array, optional
            Classifier thresholds to use.
        ``**kwargs`` : dict, optional
            Classifier parameters.

        Returns
        -------
        runs : list
            Evaluation results for each CV run.
        assessment : dict
            Final CV biometric statistics.

        Raises
        ------
        ValueError
            If the CV iterator yields no folds.

        """

        def _group(indices):
            # split the selected samples into one array per class label
            grouped = collections.defaultdict(list)
            for item in indices:
                grouped[labels[item]].append(item)

            return {sub: data[idx] for sub, idx in six.iteritems(grouped)}

        runs = []
        aux = []
        for train, test in cv:
            train_data = _group(train)
            test_data = _group(test)

            # instantiate a fresh classifier for each fold
            clf = cls(**kwargs)
            clf.batch_train(train_data)
            res = clf.evaluate(test_data, thresholds=thresholds)
            del clf

            aux.append(res['assessment'])
            runs.append(res)

        # assess runs
        if len(runs) > 0:
            subjects = runs[0]['classification']['subjectList']
            assess, = assess_runs(results=aux, subjects=subjects)
        else:
            raise ValueError("CV iterator empty or exhausted.")

        # output
        args = (runs, assess)
        names = ('runs', 'assessment')

        return utils.ReturnTuple(args, names)

    def _authenticate(self, data, label, threshold):
        """Authenticate a set of feature vectors, allegedly belonging to the
        given subject.

        The base implementation rejects every sample.

        Parameters
        ----------
        data : array
            Input test data.
        label : str
            Internal classifier subject label.
        threshold : int, float
            Authentication threshold.

        Returns
        -------
        decision : array
            Authentication decision for each input sample.

        """

        decision = np.zeros(len(data), dtype='bool')

        return decision

    def _get_thresholds(self):
        """Generate an array of reasonable thresholds.

        The base implementation returns an empty array.

        Returns
        -------
        ths : array
            Generated thresholds.

        """

        ths = np.array([])

        return ths

    def _identify(self, data, threshold=None):
        """Identify a set of feature vectors.

        The base implementation returns an empty label for every sample.

        Parameters
        ----------
        data : array
            Input test data.
        threshold : int, float
            Identification threshold.

        Returns
        -------
        labels : list
            Identity (internal label) of each input sample.

        """

        labels = [''] * len(data)

        return labels

    def _prepare(self, data, targets=None):
        """Prepare data to be processed.

        The base implementation returns `data` unchanged.

        Parameters
        ----------
        data : array
            Data to process.
        targets : list, str, optional
            Target subject labels.

        Returns
        -------
        out : object
            Processed data.

        """

        # target class labels; normalized here as a template for subclasses,
        # the base implementation does not use them any further
        if targets is None:
            targets = list(self._subject2label.values())
        elif isinstance(targets, six.string_types):
            targets = [targets]

        return data

    def _train(self, enroll=None, dismiss=None):
        """Train the classifier.

        The base implementation only iterates over the given labels.

        Parameters
        ----------
        enroll : list, optional
            Labels of new or updated subjects.
        dismiss : list, optional
            Labels of deleted subjects.

        """

        if enroll is None:
            enroll = []
        if dismiss is None:
            dismiss = []

        # process dismiss
        for _ in dismiss:
            pass

        # process enroll
        for _ in enroll:
            pass

    def _update(self, old, new):
        """Combine new data with existing templates (for one subject).

        The base implementation discards the existing data.

        Parameters
        ----------
        old : array
            Existing data.
        new : array
            New data.

        Returns
        -------
        out : array
            Combined data.

        """

        return new


class KNN(BaseClassifier):
    """K Nearest Neighbors (k-NN) biometric classifier.

    Parameters
    ----------
    k : int, optional
        Number of neighbors.
    metric : str, optional
        Distance metric.
    metric_args : dict, optional
        Additional keyword arguments are passed to the distance function.

    Attributes
    ----------
    EER_IDX : int
        Reference index for the Equal Error Rate.

    """

    EER_IDX = 0

    def __init__(self, k=3, metric='euclidean', metric_args=None):
        # parent __init__
        super(KNN, self).__init__()

        # algorithm self things
        self.k = k
        self.metric = metric
        if metric_args is None:
            metric_args = {}
        self.metric_args = metric_args

        # test metric args: fail early on an invalid metric configuration
        metrics.pdist(np.zeros((2, 2)), metric, **metric_args)

        # minimum threshold
        self.min_thr = 10 * np.finfo('float').eps

    def _sort(self, dists, train_labels):
        """Sort the computed distances.

        Each row is sorted independently, in increasing order of distance;
        the labels are reordered in the same way.

        Parameters
        ----------
        dists : array
            Unsorted computed distances.
        train_labels : list
            Unsorted target subject labels.

        Returns
        -------
        dists : array
            Sorted computed distances.
        train_labels : list
            Sorted target subject labels.

        """

        ind = dists.argsort()
        # sneaky trick from http://stackoverflow.com/questions/6155649
        # (a column of row indices broadcasts against the per-row order)
        static_inds = np.arange(dists.shape[0]).reshape((dists.shape[0], 1))
        dists = dists[static_inds, ind]
        train_labels = train_labels[static_inds, ind]

        return dists, train_labels

    def _authenticate(self, data, label, threshold):
        """Authenticate a set of feature vectors, allegedly belonging to the
        given subject.

        A sample is accepted when more than half of `k` of its nearest
        distances to the subject's templates are within the threshold.

        Parameters
        ----------
        data : dict
            Prepared data, as returned by `_prepare`.
        label : str
            Internal classifier subject label.
        threshold : int, float
            Authentication threshold.

        Returns
        -------
        decision : array
            Authentication decision for each input sample.

        """

        # unpack prepared data
        dists = data['dists']
        train_labels = data['train_labels']

        ns = len(dists)
        if ns == 0:
            # nothing to decide on an empty batch
            return np.zeros(0, dtype='bool')

        # select based on subject label
        dists = np.array([dists[i, train_labels[i, :] == label]
                          for i in range(ns)])

        # nearest neighbors
        dists = dists[:, :self.k]

        # compare distances to threshold and decide by majority
        count = np.sum(dists <= threshold, axis=1)
        decision = count > (self.k // 2)

        return decision

    def _get_thresholds(self):
        """Generate an array of reasonable thresholds.

        For metrics other than 'cosine' or 'pcosine', which have clear
        limits, generates an array based on the maximum distances between
        enrolled subjects.

        Returns
        -------
        ths : array
            Generated thresholds.

        """

        if self.metric == 'cosine':
            return np.linspace(self.min_thr, 2., 100)
        elif self.metric == 'pcosine':
            return np.linspace(self.min_thr, 1., 100)

        # the enrolled labels do not change while sampling
        labels = list(six.itervalues(self._subject2label))

        maxD = []
        for _ in range(3):
            for label in labels:
                # randomly select samples
                aux = self.io_load(label)
                ind = np.random.randint(0, aux.shape[0], 3)
                obs = aux[ind]

                # compute distances
                dists = self._prepare(obs)['dists']
                maxD.append(np.max(dists))

        # maximum distance (with a 50 % margin)
        maxD = 1.5 * np.max(maxD)

        ths = np.linspace(self.min_thr, maxD, 100)

        return ths

    def _identify(self, data, threshold=None):
        """Identify a set of feature vectors.

        Parameters
        ----------
        data : dict
            Prepared data, as returned by `_prepare`.
        threshold : int, float
            Identification threshold; if None, the threshold stored for the
            candidate label is used.

        Returns
        -------
        labels : list
            Identity (internal label) of each input sample; an empty string
            marks a rejected sample.

        """

        if threshold is None:
            def thr_fcn(label):
                return self.get_id_thr(label, ready=True)
        else:
            def thr_fcn(label):
                return threshold

        # unpack prepared data
        dists = data['dists']
        train_labels = data['train_labels']

        # nearest neighbors
        dists = dists[:, :self.k]
        train_labels = train_labels[:, :self.k]
        ns = len(dists)

        labels = []
        for i in range(ns):
            lbl, _ = majority_rule(train_labels[i, :], random=True)

            # compare distances to threshold
            count = np.sum(dists[i, :] <= thr_fcn(lbl))

            # decide
            if count > (self.k // 2):
                # accept
                labels.append(lbl)
            else:
                # reject
                labels.append('')

        return labels

    def _prepare(self, data, targets=None):
        """Prepare data to be processed.

        Computes the distances of the input data set to the target subjects.

        Parameters
        ----------
        data : array
            Data to process.
        targets : list, str, optional
            Target subject labels; defaults to all the enrolled subjects.

        Returns
        -------
        out : dict
            Processed data containing the computed distances (`dists`) and the
            target subject labels (`train_labels`).

        """

        # target class labels
        if targets is None:
            targets = list(six.itervalues(self._subject2label))
        elif isinstance(targets, six.string_types):
            targets = [targets]

        dists = []
        train_labels = []

        for label in targets:
            # compute distances
            D = metrics.cdist(data, self.io_load(label),
                              metric=self.metric, **self.metric_args)

            dists.append(D)
            # one label per computed distance
            train_labels.append(np.tile(label, D.shape))

        dists = np.concatenate(dists, axis=1)
        train_labels = np.concatenate(train_labels, axis=1)

        # sort
        dists, train_labels = self._sort(dists, train_labels)

        return {'dists': dists, 'train_labels': train_labels}

    def _update(self, old, new):
        """Combine new data with existing templates (for one subject).

        Simply concatenates old data with new data.

        Parameters
        ----------
        old : array
            Existing data.
        new : array
            New data.

        Returns
        -------
        out : array
            Combined data.

        """

        out = np.concatenate([old, new], axis=0)

        return out


[docs]class SVM(BaseClassifier):
    """Support Vector Machines (SVM) biometric classifier.

    Wraps the 'OneClassSVM' and 'SVC' classes from 'scikit-learn'.

    Parameters
    ----------
    C : float, optional
        Penalty parameter C of the error term.
    kernel : str, optional
        Specifies the kernel type to be used in the algorithm. It must be one
        of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable.
        If none is given, ‘rbf’ will be used. If a callable is given it is
        used to precompute the kernel matrix.
    degree : int, optional
        Degree of the polynomial kernel function (‘poly’). Ignored by all other
        kernels.
    gamma : float, optional
        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. If gamma is 'auto'
        then 1/n_features will be used instead.
    coef0 : float, optional
        Independent term in kernel function. It is only significant in ‘poly’
        and ‘sigmoid’.
    shrinking : bool, optional
        Whether to use the shrinking heuristic.
    tol : float, optional
        Tolerance for stopping criterion.
    cache_size : float, optional
        Specify the size of the kernel cache (in MB).
    max_iter : int, optional
        Hard limit on iterations within solver, or -1 for no limit.
    random_state : int, RandomState, optional
        The seed of the pseudo random number generator to use when shuffling
        the data for probability estimation.

    Attributes
    ----------
    EER_IDX : int
        Reference index for the Equal Error Rate.

    """

    EER_IDX = -1

    def __init__(self,
                 C=1.0,
                 kernel='linear',
                 degree=3,
                 gamma='auto',
                 coef0=0.0,
                 shrinking=True,
                 tol=0.001,
                 cache_size=200,
                 max_iter=-1,
                 random_state=None):
        """Set up an empty SVM classifier.

        The arguments are stored as the keyword arguments kept in
        `_clf_kwargs`; see the class documentation for their meaning.

        """

        # initialise the generic classifier state
        super(SVM, self).__init__()

        # smallest threshold value
        self.min_thr = 10 * np.finfo('float').eps

        # keyword arguments kept for the SVM models
        self._clf_kwargs = dict(
            C=C,
            kernel=kernel,
            degree=degree,
            gamma=gamma,
            coef0=coef0,
            shrinking=shrinking,
            tol=tol,
            cache_size=cache_size,
            max_iter=max_iter,
            random_state=random_state,
        )

        # trained models, initially none
        self._models = dict()

    def _get_weights(self, n1, n2):
        """Compute class weights.

        The weights are inversely proportional to the number of samples in each
        class.

        Parameters
        ----------
        n1 : int
            Number of samples in the first class.
        n2 : int
            Number of samples in the second class.

        Returns
        -------
        weights : dict
            Weights for each class.

        """

        w = np.array([1. / n1, 1. / n2])
        w *= 2 / np.sum(w)
        weights = {-1: w[0], 1: w[1]}

        return weights

    def _get_single_clf(self, X, label):
        """Fit a One Class SVM on the data of a single subject.

        The model always uses an 'rbf' kernel with nu=0.1 and is stored
        under the pair ('', label).

        Parameters
        ----------
        X : array
            Training data.
        label : str
            Class label.

        """

        model = sksvm.OneClassSVM(kernel='rbf', nu=0.1)
        model.fit(X)

        # register the model under the single-subject pair
        key = ('', label)
        self._models[key] = model

    def _get_kernel_clf(self, X1, X2, n1, n2, label1, label2):
        """Instantiate and train a SVC SVM classifier for a pair of subjects.

        The label that comes first in the sorted pair is assigned the target
        -1, the other one the target 1. Class weights are computed so that
        each target receives the weight matching its own number of samples.

        Parameters
        ----------
        X1 : array
            Training data for the first class.
        X2 : array
            Training data for the second class.
        n1 : int
            Number of samples in the first class.
        n2 : int
            Number of samples in the second class.
        label1 : str
            Label for the first class.
        label2 : str
            Label for the second class.

        """

        # internal (sorted) representation of the pair
        pair = self._convert_pair((label1, label2))

        # prepare data to train
        X = np.concatenate((X1, X2), axis=0)
        Y = np.ones(n1 + n2)

        if pair[0] == label1:
            # first class is the negative target
            Y[:n1] = -1
            weights = self._get_weights(n1, n2)
        else:
            # second class is the negative target, so the sample counts
            # must be swapped to keep weights aligned with the targets
            Y[n1:] = -1
            weights = self._get_weights(n2, n1)

        # instantiate and fit
        clf = sksvm.SVC(class_weight=weights, **self._clf_kwargs)
        clf.fit(X, Y)

        # add to models
        self._models[pair] = clf

    def _del_clf(self, pair):
        """Delete a binary classifier.

        Parameters
        ----------
        pair : list, tuple
            Label pair.

        """

        pair = self._convert_pair(pair)
        m = self._models.pop(pair)
        del m

    def _convert_pair(self, pair):
        """Sort and convert a label pair to the internal representation format.

        Parameters
        ----------
        pair : list, tuple
            Input label pair.

        Returns
        -------
        pair : tuple
            Sorted label pair.

        """

        pair = tuple(sorted(pair))

        return pair

    def _predict(self, pair, X):
        """Get a classifier prediction of the input data, given the label pair.

        Parameters
        ----------
        pair : list, tuple
            Label pair.
        X : array
            Input data to classify.

        Returns
        -------
        prediction : array
            Prediction for each sample in the input data.

        """

        # convert pair
        pair = self._convert_pair(pair)

        # classify
        aux = self._models[pair].predict(X)

        prediction = []
        for item in aux:
            if item < 0:
                prediction.append(pair[0])
            elif item > 0:
                prediction.append(pair[1])
            else:
                prediction.append('')

        prediction = np.array(prediction)

        return prediction

    def _authenticate(self, data, label, threshold):
        """Authenticate a set of feature vectors, allegedly belonging to the
        given subject.

        Only the pairwise predictions that involve `label` are considered.
        A sample is accepted when the majority vote equals `label` and the
        fraction of agreeing classifiers exceeds the threshold.

        Parameters
        ----------
        data : dict
            Prepared data, with the keys `predictions` and `pairs` (as
            returned by `_prepare`).
        label : str
            Internal classifier subject label.
        threshold : int, float
            Authentication threshold.

        Returns
        -------
        decision : array
            Authentication decision for each input sample.

        """

        # unpack prepared data
        predictions = data['predictions']
        nb_samples = predictions.shape[1]
        pairs = data['pairs']

        # number of classifiers each subject takes part in
        norm = float(self._nbSubjects - 1) if self._nbSubjects > 1 else 1.0

        # keep only the classifiers that involve the claimed label
        rows = np.nonzero([label in p for p in pairs])[0]
        votes = predictions[rows, :]

        decision = []
        for i in range(nb_samples):
            # determine majority
            winner, count = majority_rule(votes[:, i], random=True)
            rate = float(count) / norm

            # the empty label and low agreement rates are always rejected
            accepted = winner != '' and rate > threshold
            decision.append(winner == label if accepted else False)

        return np.array(decision)

    def _get_thresholds(self):
        """Generate an array of reasonable thresholds.

        The thresholds correspond to the relative number of binary classifiers
        that agree on a class.

        Returns
        -------
        ths : array
            Generated thresholds.

        """

        ths = np.linspace(self.min_thr, 1.0, 100)

        return ths

    def _identify(self, data, threshold=None):
        """Identify a set of feature vectors.

        For each sample the majority vote over all prepared predictions is
        taken; the vote is accepted when the fraction of agreeing classifiers
        exceeds the threshold, otherwise the sample is rejected (label '').

        Parameters
        ----------
        data : dict
            Prepared data, with the key `predictions` (as returned by
            `_prepare`).
        threshold : int, float, optional
            Identification threshold; if None, the threshold of the winning
            label is obtained from `get_id_thr`.

        Returns
        -------
        labels : list
            Identity (internal label) of each input sample.

        """

        if threshold is None:
            def get_thr(label):
                return self.get_id_thr(label, ready=True)
        else:
            def get_thr(label):
                return threshold

        # unpack prepared data
        predictions = data['predictions']
        nb_samples = predictions.shape[1]

        # number of classifiers each subject takes part in
        norm = float(self._nbSubjects - 1) if self._nbSubjects > 1 else 1.0

        labels = []
        for i in range(nb_samples):
            # determine majority
            winner, count = majority_rule(predictions[:, i], random=True)
            rate = float(count) / norm

            if winner != '' and rate > get_thr(winner):
                # accept
                labels.append(winner)
            else:
                # reject
                labels.append('')

        return labels

    def _prepare(self, data, targets=None):
        """Prepare data to be processed.

        Computes the predictions for each of the targeted classifier pairs.

        Parameters
        ----------
        data : array
            Data to process.
        targets : list, str, optional
            Target subject labels.

        Returns
        -------
        out : dict
            Processed data containing an array with the predictions of each
            input sample (`predictions`) and a list with the target label
            pairs (`pairs`).

        """

        # target class labels
        if self._nbSubjects == 1:
            pairs = list(self._models)
        else:
            if targets is None:
                pairs = list(self._models)
            elif isinstance(targets, six.string_types):
                labels = list(
                    set(self._subject2label.values()) - set([targets]))
                pairs = [[targets, lbl] for lbl in labels]
            else:
                pairs = []
                for t in targets:
                    labels = list(set(self._subject2label.values()) - set([t]))
                    pairs.extend([t, lbl] for lbl in labels)

        # predict
        predictions = np.array([self._predict(p, data) for p in pairs])

        out = {'predictions': predictions, 'pairs': pairs}

        return out

    def _train(self, enroll=None, dismiss=None):
        """Train the classifier.

        Parameters
        ----------
        enroll : list, optional
            Labels of new or updated subjects.
        dismiss : list, optional
            Labels of deleted subjects.

        """

        if enroll is None:
            enroll = []
        if dismiss is None:
            dismiss = []

        # process dismiss
        src_pairs = list(self._models)
        pairs = []
        for t in dismiss:
            pairs.extend([p for p in src_pairs if t in p])

        for p in pairs:
            self._del_clf(p)

        # process enroll
        existing = list(set(self._subject2label.values()) - set(enroll))
        for i, t1 in enumerate(enroll):
            X1 = self.io_load(t1)
            n1 = len(X1)

            # existing subjects
            for t2 in existing:
                X2 = self.io_load(t2)
                n2 = len(X2)
                self._get_kernel_clf(X1, X2, n1, n2, t1, t2)

            # new subjects
            for t2 in enroll[i + 1:]:
                X2 = self.io_load(t2)
                n2 = len(X2)
                self._get_kernel_clf(X1, X2, n1, n2, t1, t2)

        # check singles
        if self._nbSubjects == 1:
            label = list(six.itervalues(self._subject2label))[0]
            X = self.io_load(label)
            self._get_single_clf(X, label)
        elif self._nbSubjects > 1:
            aux = [p for p in self._models if '' in p]
            if len(aux) != 0:
                for p in aux:
                    self._del_clf(p)

    def _update(self, old, new):
        """Combine new data with existing templates (for one subject).

        Simply concatenates old data with new data.

        Parameters
        ----------
        old : array
            Existing data.
        new : array
            New data.

        Returns
        -------
        out : array
            Combined data.

        """

        out = np.concatenate([old, new], axis=0)

        return out


def get_auth_rates(TP=None, FP=None, TN=None, FN=None, thresholds=None):
    """Compute authentication rates from the confusion matrix.

    Parameters
    ----------
    TP : array
        True Positive counts for each classifier threshold.
    FP : array
        False Positive counts for each classifier threshold.
    TN : array
        True Negative counts for each classifier threshold.
    FN : array
        False Negative counts for each classifier threshold.
    thresholds : array
        Classifier thresholds.

    Returns
    -------
    Acc : array
        Accuracy at each classifier threshold.
    TAR : array
        True Accept Rate at each classifier threshold.
    FAR : array
        False Accept Rate at each classifier threshold.
    FRR : array
        False Reject Rate at each classifier threshold.
    TRR : array
        True Reject Rate at each classifier threshold.
    EER : array
        Equal Error Rate points, with format (threshold, rate).
    Err : array
        Error rate at each classifier threshold.
    PPV : array
        Positive Predictive Value at each classifier threshold.
    FDR : array
        False Discovery Rate at each classifier threshold.
    NPV : array
        Negative Predictive Value at each classifier threshold.
    FOR : array
        False Omission Rate at each classifier threshold.
    MCC : array
        Matthews Correlation Coefficient at each classifier threshold.

    Raises
    ------
    TypeError
        If any of the inputs is missing.

    Notes
    -----
    Denominators equal to zero are replaced by one to avoid divisions by
    zero.

    """

    # check inputs
    if TP is None:
        raise TypeError("Please specify the input TP counts.")
    if FP is None:
        raise TypeError("Please specify the input FP counts.")
    if TN is None:
        raise TypeError("Please specify the input TN counts.")
    if FN is None:
        raise TypeError("Please specify the input FN counts.")
    if thresholds is None:
        raise TypeError("Please specify the input classifier thresholds.")

    # ensure numpy; counts are converted to float so that the product of
    # the marginal sums (used by the MCC) cannot overflow an integer dtype
    TP = np.array(TP, dtype='float')
    FP = np.array(FP, dtype='float')
    TN = np.array(TN, dtype='float')
    FN = np.array(FN, dtype='float')
    thresholds = np.array(thresholds)

    # helper variables
    A = TP + FP  # predicted positives
    B = TP + FN  # actual positives
    C = TN + FP  # actual negatives
    D = TN + FN  # predicted negatives
    E = A * B * C * D
    F = A + D  # total number of decisions

    # avoid divisions by zero
    for denom in (A, B, C, D, E, F):
        denom[denom == 0] = 1.

    # rates
    Acc = (TP + TN) / F  # accuracy
    Err = (FP + FN) / F  # error rate

    TAR = TP / B  # true accept rate / true positive rate
    FRR = FN / B  # false rejection rate / false negative rate

    TRR = TN / C  # true rejection rate / true negative rate
    FAR = FP / C  # false accept rate / false positive rate

    PPV = TP / A  # positive predictive value
    FDR = FP / A  # false discovery rate

    NPV = TN / D  # negative predictive value
    FOR = FN / D  # false omission rate

    MCC = (TP * TN - FP * FN) / np.sqrt(E)  # matthews correlation coefficient

    # determine EER as the intersection of the FAR and FRR curves
    roots, values = tools.find_intersection(thresholds, FAR, thresholds, FRR)
    EER = np.vstack((roots, values)).T

    # output
    args = (Acc, TAR, FAR, FRR, TRR, EER, Err, PPV, FDR, NPV, FOR, MCC)
    names = ('Acc', 'TAR', 'FAR', 'FRR', 'TRR', 'EER', 'Err', 'PPV', 'FDR',
             'NPV', 'FOR', 'MCC')

    return utils.ReturnTuple(args, names)


def get_id_rates(H=None, M=None, R=None, N=None, thresholds=None):
    """Compute identification rates from the confusion matrix.

    Parameters
    ----------
    H : array
        Hit counts for each classifier threshold.
    M : array
        Miss counts for each classifier threshold.
    R : array
        Reject counts for each classifier threshold.
    N : int, array
        Number of test samples; may also be given per classifier threshold.
    thresholds : array
        Classifier thresholds.

    Returns
    -------
    Acc : array
        Accuracy at each classifier threshold.
    Err : array
        Error rate at each classifier threshold.
    MR : array
        Miss Rate at each classifier threshold.
    RR : array
        Reject Rate at each classifier threshold.
    EID : array
        Error of Identification points, with format (threshold, rate).
    EER : array
        Equal Error Rate points, with format (threshold, rate).

    Raises
    ------
    TypeError
        If any of the inputs is missing.

    Notes
    -----
    A number of test samples equal to zero is replaced by one to avoid
    divisions by zero.

    """

    # check inputs
    if H is None:
        raise TypeError("Please specify the input H counts.")
    if M is None:
        raise TypeError("Please specify the input M counts.")
    if R is None:
        raise TypeError("Please specify the input R counts.")
    if N is None:
        raise TypeError("Please specify the total number of test samples.")
    if thresholds is None:
        raise TypeError("Please specify the input classifier thresholds.")

    # ensure numpy (floating point, so that the rates are true ratios)
    H = np.array(H, dtype='float')
    M = np.array(M, dtype='float')
    R = np.array(R, dtype='float')
    thresholds = np.array(thresholds)

    # avoid divisions by zero
    N = np.array(N, dtype='float')
    N = np.where(N == 0, 1., N)

    Acc = H / N
    Err = 1 - Acc
    MR = M / N
    RR = R / N

    # EER: intersection of the miss rate and reject rate curves
    roots, values = tools.find_intersection(thresholds, MR, thresholds, RR)
    EER = np.vstack((roots, values)).T

    # EID: points where the error rate reaches its minimum
    y2 = np.min(Err) * np.ones(len(thresholds), dtype='float')
    roots, values = tools.find_intersection(thresholds, Err, thresholds, y2)
    EID = np.vstack((roots, values)).T

    # output
    args = (Acc, Err, MR, RR, EID, EER)
    names = ('Acc', 'Err', 'MR', 'RR', 'EID', 'EER')

    return utils.ReturnTuple(args, names)


def get_subject_results(results=None,
                        subject=None,
                        thresholds=None,
                        subjects=None,
                        subject_dict=None,
                        subject_idx=None):
    """Compute authentication and identification performance metrics for a
    given subject.

    Parameters
    ----------
    results : dict
        Classification results, with the keys `authentication` (array
        indexed by threshold, subject index and test sample) and
        `identification` (array indexed by threshold and test sample).
    subject : hashable
        True subject label.
    thresholds : array
        Classifier thresholds.
    subjects : list
        Target subject classes.
    subject_dict : bidict
        Subject-label conversion dictionary.
    subject_idx : list
        Subject index.

    Returns
    -------
    assessment : dict
        Authentication and identification results.

    Raises
    ------
    TypeError
        If any of the inputs is missing.
    ValueError
        If the dimensions of the results and thresholds do not match.

    """

    # check inputs
    if results is None:
        raise TypeError("Please specify the input classification results.")
    if subject is None:
        raise TypeError("Please specify the input subject class.")
    if thresholds is None:
        raise TypeError("Please specify the input classifier thresholds.")
    if subjects is None:
        raise TypeError("Please specify the target subject classes.")
    if subject_dict is None:
        raise TypeError("Please specify the subject-label dictionary.")
    if subject_idx is None:
        raise TypeError("Please specify subject index.")

    nth = len(thresholds)
    auth_res = results['authentication']
    id_res = results['identification']
    ns = auth_res.shape[2]

    # sanity checks
    if auth_res.shape[0] != id_res.shape[0]:
        raise ValueError("Authentication and identification number of "
                         "thresholds do not match.")
    if auth_res.shape[0] != nth:
        raise ValueError("Number of thresholds in vector does not match "
                         "biometric results.")
    if auth_res.shape[2] != id_res.shape[1]:
        raise ValueError("Authentication and identification number of tests "
                         "do not match.")

    label = subject_dict[subject]

    # authentication vars
    TP = np.zeros(nth, dtype='float')
    FP = np.zeros(nth, dtype='float')
    TN = np.zeros(nth, dtype='float')
    FN = np.zeros(nth, dtype='float')

    # identification vars
    H = np.zeros(nth, dtype='float')
    M = np.zeros(nth, dtype='float')
    R = np.zeros(nth, dtype='float')
    CM = []

    for i in range(nth):  # for each threshold
        # authentication
        for k, lbl in enumerate(subject_idx):  # for each subject
            subject_tst = subjects[k]

            # number of accepted test samples for this claimed subject
            accepted = np.sum(auth_res[i, lbl, :])
            if subject == subject_tst:
                # genuine claim: accepts are true positives
                TP[i] += accepted
                FN[i] += (ns - accepted)
            else:
                # impostor claim: accepts are false positives
                FP[i] += accepted
                TN[i] += (ns - accepted)

        # identification
        res = id_res[i, :]
        hits = res == label
        nhits = np.sum(hits)
        rejects = res == ''
        nrejects = np.sum(rejects)
        # a miss is any sample neither correctly identified nor rejected
        misses = np.logical_not(np.logical_or(hits, rejects))
        nmisses = ns - (nhits + nrejects)
        # number of times each wrong subject was identified
        missCounts = {
            subject_dict.inv[ms]: np.sum(res == ms)
            for ms in np.unique(res[misses])
        }

        # appends
        H[i] = nhits
        M[i] = nmisses
        R[i] = nrejects
        CM.append(missCounts)

    # compute rates
    auth_rates = get_auth_rates(TP, FP, TN, FN, thresholds).as_dict()
    id_rates = get_id_rates(H, M, R, ns, thresholds).as_dict()

    output = {
        'authentication': {
            'confusionMatrix': {'TP': TP, 'FP': FP, 'TN': TN, 'FN': FN},
            'rates': auth_rates,
        },
        'identification': {
            'confusionMatrix': {'H': H, 'M': M, 'R': R, 'CM': CM},
            'rates': id_rates,
        },
    }

    return utils.ReturnTuple((output,), ('assessment',))


def assess_classification(results=None, thresholds=None):
    """Assess the performance of a biometric classification test.

    The per-subject results are computed with `get_subject_results` and
    accumulated into global confusion matrices, from which the global
    authentication and identification rates are derived.

    Parameters
    ----------
    results : dict
        Classification results; contains the keys `subjectDict` and
        `subjectList`, plus one entry for each test subject.
    thresholds : array
        Classifier thresholds.

    Returns
    -------
    assessment : dict
        Classification assessment.

    """

    # check inputs
    if results is None:
        raise TypeError("Please specify the input classification results.")
    if thresholds is None:
        raise TypeError("Please specify the input classifier thresholds.")

    # test subjects
    subjectDict = results['subjectDict']
    subParent = results['subjectList']
    subIdx = sorted(subParent.index(item) for item in subParent)
    subjects = [subParent[idx] for idx in subIdx]

    # global confusion matrices, updated in place below
    auth = {'TP': 0., 'TN': 0., 'FP': 0., 'FN': 0.}
    iden = {'H': 0., 'M': 0., 'R': 0.}

    # output object
    output = {
        'global': {
            'authentication': {'confusionMatrix': auth},
            'identification': {'confusionMatrix': iden},
        },
        'subject': {},
        'thresholds': thresholds,
    }

    nth = len(thresholds)
    # number of misses attributed to each subject, for each threshold
    C = np.zeros((nth, len(subjects)), dtype='float')

    for test_user in subjects:
        aux, = get_subject_results(results[test_user], test_user, thresholds,
                                   subjects, subjectDict, subIdx)

        # copy to subject
        output['subject'][test_user] = aux

        # authentication
        auth_cm = aux['authentication']['confusionMatrix']
        for m in ('TP', 'TN', 'FP', 'FN'):
            auth[m] += auth_cm[m]

        # identification
        iden_cm = aux['identification']['confusionMatrix']
        for m in ('H', 'M', 'R'):
            iden[m] += iden_cm[m]

        # subject misses
        for i, item in enumerate(iden_cm['CM']):
            for k, sub in enumerate(subjects):
                if sub in item:
                    C[i, k] += item[sub]

    # normalize subject misses
    sC = C.sum(axis=1).reshape((nth, 1))
    # avoid division by zero
    sC[sC <= 0] = 1.
    CR = C / sC

    # update subjects
    for k, sub in enumerate(subjects):
        sub_iden = output['subject'][sub]['identification']
        sub_iden['confusionMatrix']['C'] = C[:, k]
        sub_iden['rates']['CR'] = CR[:, k]

    # compute global rates
    # authentication
    auth_rates = get_auth_rates(auth['TP'], auth['FP'], auth['TN'],
                                auth['FN'], thresholds)
    output['global']['authentication']['rates'] = auth_rates.as_dict()

    # identification
    Ns = iden['H'] + iden['M'] + iden['R']
    id_rates = get_id_rates(iden['H'], iden['M'], iden['R'], Ns, thresholds)
    output['global']['identification']['rates'] = id_rates.as_dict()

    return utils.ReturnTuple((output,), ('assessment',))


def assess_runs(results=None, subjects=None):
    """Assess the performance of multiple biometric classification runs.

    The confusion matrices of all runs are averaged, per subject and
    globally, and the rates are recomputed from the averaged counts. When
    a single run is given, its assessment is returned unchanged.

    Parameters
    ----------
    results : list
        Classification assessment for each run.
    subjects : list
        Common target subject classes.

    Returns
    -------
    assessment : dict
        Global classification assessment.

    Raises
    ------
    TypeError
        If any of the inputs is missing.
    ValueError
        If the list of runs is empty.

    """

    # check inputs
    if results is None:
        raise TypeError("Please specify the input classification results.")
    if subjects is None:
        raise TypeError("Please specify the common subject classes.")

    nb = len(results)
    if nb == 0:
        raise ValueError("Please provide at least one classification run.")
    if nb == 1:
        return utils.ReturnTuple((results[0],), ('assessment',))

    # confusion matrix entries
    auth_keys = ['TP', 'TN', 'FP', 'FN']
    iden_keys = ['H', 'M', 'R']
    iden_subject_keys = ['H', 'M', 'R', 'C']

    # the thresholds of the first run are used for all runs
    thresholds = results[0]['thresholds']
    nb_runs = float(nb)

    # global confusion matrices, updated in place below
    auth = {'TP': 0., 'TN': 0., 'FP': 0., 'FN': 0.}
    iden = {'H': 0., 'M': 0., 'R': 0.}

    # output
    output = {
        'global': {
            'authentication': {'confusionMatrix': auth},
            'identification': {'confusionMatrix': iden},
        },
        'subject': {},
        'thresholds': thresholds,
    }

    for sub in subjects:
        # create subject confusion matrix, rates
        authS = {'TP': 0., 'TN': 0., 'FP': 0., 'FN': 0.}
        idenS = {'H': 0., 'M': 0., 'R': 0., 'C': 0.}
        entry = {
            'authentication': {'confusionMatrix': authS, 'rates': {}},
            'identification': {'confusionMatrix': idenS, 'rates': {}},
        }
        output['subject'][sub] = entry

        # update confusions
        for run in results:
            # authentication
            auth_run = run['subject'][sub]['authentication']['confusionMatrix']
            for m in auth_keys:
                auth[m] += auth_run[m]
                authS[m] += auth_run[m]

            # identification
            iden_run = run['subject'][sub]['identification']['confusionMatrix']
            for m in iden_subject_keys:
                idenS[m] += iden_run[m]
            for m in iden_keys:
                iden[m] += iden_run[m]

        # compute subject mean
        for m in auth_keys:
            authS[m] /= nb_runs
        for m in iden_subject_keys:
            idenS[m] /= nb_runs

        # compute subject rates
        auth_rates = get_auth_rates(authS['TP'], authS['FP'], authS['TN'],
                                    authS['FN'], thresholds)
        entry['authentication']['rates'] = auth_rates.as_dict()

        Ns = idenS['H'] + idenS['M'] + idenS['R']
        id_rates = get_id_rates(idenS['H'], idenS['M'], idenS['R'], Ns,
                                thresholds)
        entry['identification']['rates'] = id_rates.as_dict()

        # relative misses; a copy is used so the counts stay untouched
        M = np.array(idenS['M'], copy=True)
        M[M <= 0] = 1.
        entry['identification']['rates']['CR'] = idenS['C'] / M

    # compute global mean
    for m in auth_keys:
        auth[m] /= nb_runs
    for m in iden_keys:
        iden[m] /= nb_runs

    # compute rates
    auth_rates = get_auth_rates(auth['TP'], auth['FP'], auth['TN'],
                                auth['FN'], thresholds)
    output['global']['authentication']['rates'] = auth_rates.as_dict()

    Ns = iden['H'] + iden['M'] + iden['R']
    id_rates = get_id_rates(iden['H'], iden['M'], iden['R'], Ns, thresholds)
    output['global']['identification']['rates'] = id_rates.as_dict()

    return utils.ReturnTuple((output,), ('assessment',))


def combination(results=None, weights=None):
    """Combine results from multiple classifiers.

    Each classifier contributes, for every class, the fraction of its
    results that equal that class, multiplied by the classifier weight.
    The class with the largest accumulated weight is the decision.

    Parameters
    ----------
    results : dict
        Results for each classifier.
    weights : dict, optional
        Weight for each classifier; classifiers without an entry have
        weight 1.

    Returns
    -------
    decision : object
        Consensus decision.
    confidence : float
        Confidence estimate of the decision.
    counts : array
        Weight for each possible decision outcome.
    classes : array
        List of possible decision outcomes.

    Raises
    ------
    TypeError
        If the results are missing.
    CombinationError
        If there are no classifiers, or no results at all.

    """

    # check inputs
    if results is None:
        raise TypeError("Please specify the input classification results.")
    if weights is None:
        weights = {}

    # compile results to find all classes
    vec = list(results.values())
    if len(vec) == 0:
        raise CombinationError("No keys found.")

    unq = np.unique(np.concatenate(vec))

    nb = len(unq)
    if nb == 0:
        # empty array
        raise CombinationError("No values found.")
    elif nb == 1:
        # unanimous result
        decision = unq[0]
        confidence = 1.
        counts = np.ones(1, dtype='float')
    else:
        # multi-class
        counts = np.zeros(nb, dtype='float')

        for n in results:
            # ensure array
            res = np.array(results[n])
            ns = float(len(res))
            if ns == 0:
                # a classifier without results casts no vote
                continue

            w = weights.get(n, 1.)

            # get count for each unique class
            for i in range(nb):
                aux = float(np.sum(res == unq[i]))
                counts[i] += ((aux / ns) * w)

        # most frequent class
        predMax = counts.argmax()
        counts /= counts.sum()

        decision = unq[predMax]
        confidence = counts[predMax]

    # output
    args = (decision, confidence, counts, unq)
    names = ('decision', 'confidence', 'counts', 'classes')

    return utils.ReturnTuple(args, names)


def majority_rule(labels=None, random=True):
    """Determine the most frequent class label.

    Parameters
    ----------
    labels : array, list
        List of class labels.
    random : bool, optional
        If True, will choose randomly in case of tied classes, otherwise the
        first element is chosen.

    Returns
    -------
    decision : object
        Consensus decision.
    count : int
        Number of elements of the consensus decision.

    Raises
    ------
    TypeError
        If the labels are missing.
    CombinationError
        If the list of labels is empty.

    """

    # check inputs
    if labels is None:
        raise TypeError("Please specify the input list of class labels.")

    if len(labels) == 0:
        raise CombinationError("Empty list of class labels.")

    # count unique occurrences
    classes, freqs = np.unique(labels, return_counts=True)

    # most frequent class
    best = freqs.argmax()

    if random:
        # classes that share the highest count
        tied = np.nonzero(freqs == freqs[best])[0]
        nb_tied = len(tied)

        if nb_tied > 1:
            # break the tie randomly
            best = tied[np.random.randint(0, nb_tied)]

    return utils.ReturnTuple((classes[best], freqs[best]),
                             ('decision', 'count'))


def cross_validation(labels,
                     n_iter=10,
                     test_size=0.1,
                     train_size=None,
                     random_state=None):
    """Return a Cross Validation (CV) iterator.

    Wraps the StratifiedShuffleSplit iterator from sklearn.model_selection.
    This iterator returns stratified randomized folds, which preserve the
    percentage of samples for each class.

    Parameters
    ----------
    labels : list, array
        List of class labels for each data sample.
    n_iter : int, optional
        Number of splitting iterations.
    test_size : float, int, optional
        If float, represents the proportion of the dataset to include in the
        test split; if int, represents the absolute number of test samples.
    train_size : float, int, optional
        If float, represents the proportion of the dataset to include in the
        train split; if int, represents the absolute number of train samples.
    random_state : int, RandomState, optional
        The seed of the pseudo random number generator to use when shuffling
        the data.

    Returns
    -------
    cv : CV iterator
        Cross Validation iterator.

    """

    splitter = skcv.StratifiedShuffleSplit(
        n_splits=n_iter,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )

    # only the labels matter for the split, so a placeholder array of
    # zeros stands in for the data
    placeholder = np.zeros(len(labels))
    cv = splitter.split(placeholder, labels)

    return utils.ReturnTuple((cv,), ('cv',))