# encoding: utf-8
# pylint: disable=no-member
# pylint: disable=invalid-name
# pylint: disable=too-many-arguments
# pylint: disable=wrong-import-position
"""
Evaluation package.
"""
from __future__ import absolute_import, division, print_function
import numpy as np

# evaluation helper functions
def find_closest_matches(detections, annotations):
"""
Find the closest annotation for each detection.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
Returns
-------
indices : numpy array
Indices of the closest matches.
Notes
-----
The sequences must be ordered.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=int)
# if only a single annotation is given
if len(annotations) == 1:
        # all detections match the only annotation, thus return zero indices
        return np.zeros(len(detections), dtype=int)
# solution found at: http://stackoverflow.com/questions/8914491/
indices = annotations.searchsorted(detections)
indices = np.clip(indices, 1, len(annotations) - 1)
left = annotations[indices - 1]
right = annotations[indices]
    # decrement the index if the detection is closer to the left neighbour
    indices -= detections - left < right - detections
# return the indices of the closest matches
return indices
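
# A minimal usage sketch (illustrative values, not part of the module): with
# ordered 1D sequences, each detection is mapped to the index of its nearest
# annotation.
# >>> find_closest_matches([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([0, 1, 2])
# Ties are resolved towards the right neighbour, since the index is only
# decremented if the detection is strictly closer to the left neighbour.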

def calc_errors(detections, annotations, matches=None):
"""
Errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=float)
# determine the closest annotations
if matches is None:
matches = find_closest_matches(detections, annotations)
    # calculate the signed errors against the matched annotations
    errors = detections - annotations[matches]
# return the errors
return errors
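
# Usage sketch (same illustrative values as above): signed errors to the
# closest annotations, negative if the detection precedes its annotation.
# >>> calc_errors([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([ 0.25,  0.25, -0.25])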

def calc_absolute_errors(detections, annotations, matches=None):
"""
Absolute errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Absolute errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# return the errors
return np.abs(calc_errors(detections, annotations, matches))
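
# Usage sketch (same illustrative values as above): absolute errors simply
# drop the sign of calc_errors().
# >>> calc_absolute_errors([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([0.25, 0.25, 0.25])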

def calc_relative_errors(detections, annotations, matches=None):
"""
Relative errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Relative errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=float)
# determine the closest annotations
if matches is None:
matches = find_closest_matches(detections, annotations)
    # calculate the signed errors
    errors = calc_errors(detections, annotations, matches)
    # return the errors relative to the values of the matched annotations
    return np.abs(errors / annotations[matches])
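
# Usage sketch (illustrative values only): errors are normalised by the value
# of the matched annotation, which must therefore be non-zero.
# >>> calc_relative_errors([1.25, 1.5, 5.0], [1.0, 2.0, 4.0])
# array([0.25, 0.25, 0.25])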

# abstract evaluation mixin class
class EvaluationMixin(object):
"""
Evaluation mixin class.
This class has a `name` attribute which is used for display purposes and
defaults to 'None'.
    `METRIC_NAMES` is a list of tuples, each containing an attribute's name
    and the corresponding label; see the commented example below this
    docstring.
The attributes defined in `METRIC_NAMES` will be provided as an ordered
dictionary as the `metrics` property unless the subclass overwrites the
property.
`FLOAT_FORMAT` is used to format floats.
"""
# Example:
# METRIC_NAMES = [
# ('precision', 'Precision'),
# ('recall', 'Recall'),
# ('fmeasure', 'F-measure'),
# ]
name = None
METRIC_NAMES = []
FLOAT_FORMAT = '{:.3f}'
@property
def metrics(self):
"""Metrics as a dictionary."""
        from collections import OrderedDict
        metrics = OrderedDict()
for metric in [m[0] for m in self.METRIC_NAMES]:
metrics[metric] = getattr(self, metric)
return metrics
def __len__(self):
"""Length of the evaluation object."""
raise NotImplementedError('must be implemented by subclass.')

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
Notes
-----
        This is a fallback method which formats the `metrics` dictionary in a
        human readable way. Classes inheriting from this mixin should provide
        a better suited method.
"""
# pylint: disable=unused-argument
import pprint
return pprint.pformat(dict(self.metrics), indent=4)
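
# A minimal subclass sketch (hypothetical class, for illustration only): every
# attribute listed in METRIC_NAMES is picked up by the `metrics` property.
# >>> class CountEvaluation(EvaluationMixin):
# ...     METRIC_NAMES = [('count', 'Count')]
# ...     count = 42
# >>> CountEvaluation().metrics
# OrderedDict([('count', 42)])
# (the exact repr depends on the Python version)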

# evaluation classes
class SimpleEvaluation(EvaluationMixin):
"""
Simple Precision, Recall, F-measure and Accuracy evaluation based on the
numbers of true/false positive/negative detections.
Parameters
----------
num_tp : int
Number of true positive detections.
num_fp : int
Number of false positive detections.
num_tn : int
Number of true negative detections.
num_fn : int
Number of false negative detections.
name : str
Name to be displayed.
Notes
-----
This class is only suitable for a 1-class evaluation problem.
"""
METRIC_NAMES = [
('num_tp', 'No. of true positives'),
('num_fp', 'No. of false positives'),
('num_tn', 'No. of true negatives'),
('num_fn', 'No. of false negatives'),
        ('num_annotations', 'No. of annotations'),
('precision', 'Precision'),
('recall', 'Recall'),
('fmeasure', 'F-measure'),
('accuracy', 'Accuracy'),
]
def __init__(self, num_tp=0, num_fp=0, num_tn=0, num_fn=0, name=None,
**kwargs):
# pylint: disable=unused-argument
# hidden variables, to be able to overwrite them in subclasses
self._num_tp = int(num_tp)
self._num_fp = int(num_fp)
self._num_tn = int(num_tn)
self._num_fn = int(num_fn)
# name of the evaluation
self.name = name
@property
def num_tp(self):
"""Number of true positive detections."""
return self._num_tp
@property
def num_fp(self):
"""Number of false positive detections."""
return self._num_fp
@property
def num_tn(self):
"""Number of true negative detections."""
return self._num_tn
@property
def num_fn(self):
"""Number of false negative detections."""
return self._num_fn
@property
def num_annotations(self):
"""Number of annotations."""
return self.num_tp + self.num_fn
def __len__(self):
# the length equals the number of annotations
return self.num_annotations
@property
def precision(self):
"""Precision."""
# correct / retrieved
retrieved = float(self.num_tp + self.num_fp)
# if there are no positive predictions, none of them are wrong
if retrieved == 0:
return 1.
return self.num_tp / retrieved
@property
def recall(self):
"""Recall."""
# correct / relevant
relevant = float(self.num_tp + self.num_fn)
# if there are no positive annotations, we recalled all of them
if relevant == 0:
return 1.
return self.num_tp / relevant
@property
def fmeasure(self):
"""F-measure."""
# 2pr / (p+r)
numerator = 2. * self.precision * self.recall
if numerator == 0:
return 0.
return numerator / (self.precision + self.recall)
@property
def accuracy(self):
"""Accuracy."""
# acc: (TP + TN) / (TP + FP + TN + FN)
denominator = self.num_fp + self.num_fn + self.num_tp + self.num_tn
if denominator == 0:
return 1.
numerator = float(self.num_tp + self.num_tn)
if numerator == 0:
return 0.
return numerator / denominator

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
ret = ''
if self.name is not None:
ret += '%s\n ' % self.name
ret += 'Annotations: %5d TP: %5d FP: %5d FN: %5d ' \
'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
(self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
self.precision, self.recall, self.fmeasure, self.accuracy)
return ret
def __str__(self):
return self.tostring()
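
# Usage sketch (illustrative counts only): 8 of 10 annotated events were
# detected, along with 2 false positives.
# >>> e = SimpleEvaluation(num_tp=8, num_fp=2, num_fn=2, name='demo')
# >>> e.num_annotations, e.precision, e.recall
# (10, 0.8, 0.8)
# >>> '%.3f' % e.accuracy
# '0.667'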

# evaluate Precision, Recall, F-measure and Accuracy with lists or numpy arrays
class Evaluation(SimpleEvaluation):
"""
Evaluation class for measuring Precision, Recall and F-measure based on
numpy arrays or lists with true/false positive/negative detections.
Parameters
----------
tp : list or numpy array
True positive detections.
fp : list or numpy array
False positive detections.
tn : list or numpy array
True negative detections.
fn : list or numpy array
False negative detections.
name : str
Name to be displayed.
"""
def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
# set default values
if tp is None:
tp = []
if fp is None:
fp = []
if tn is None:
tn = []
if fn is None:
fn = []
# instantiate a SimpleEvaluation object
super(Evaluation, self).__init__(**kwargs)
# convert everything to numpy arrays and save them
        self.tp = np.asarray(list(tp), dtype=float)
        self.fp = np.asarray(list(fp), dtype=float)
        self.tn = np.asarray(list(tn), dtype=float)
        self.fn = np.asarray(list(fn), dtype=float)
@property
def num_tp(self):
"""Number of true positive detections."""
return len(self.tp)
@property
def num_fp(self):
"""Number of false positive detections."""
return len(self.fp)
@property
def num_tn(self):
"""Number of true negative detections."""
return len(self.tn)
@property
def num_fn(self):
"""Number of false negative detections."""
return len(self.fn)
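
# Usage sketch (illustrative event times in seconds): all counts are derived
# from the lengths of the given arrays.
# >>> e = Evaluation(tp=[0.25, 1.0], fp=[2.5], fn=[3.0], name='file_1')
# >>> e.num_tp, e.num_fp, e.num_fn, e.num_annotations
# (2, 1, 1, 3)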

# class for evaluation of Precision, Recall, F-measure with 2D arrays
class MultiClassEvaluation(Evaluation):
"""
Evaluation class for measuring Precision, Recall and F-measure based on
2D numpy arrays with true/false positive/negative detections.
Parameters
----------
tp : list of tuples or numpy array, shape (num_tp, 2)
True positive detections.
fp : list of tuples or numpy array, shape (num_fp, 2)
False positive detections.
tn : list of tuples or numpy array, shape (num_tn, 2)
True negative detections.
fn : list of tuples or numpy array, shape (num_fn, 2)
False negative detections.
name : str
Name to be displayed.
Notes
-----
    The second item of the tuples (or the second column of the arrays) denotes
    the class the detection belongs to.
"""
def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
# set default values
if tp is None:
tp = np.zeros((0, 2))
if fp is None:
fp = np.zeros((0, 2))
if tn is None:
tn = np.zeros((0, 2))
if fn is None:
fn = np.zeros((0, 2))
super(MultiClassEvaluation, self).__init__(**kwargs)
        self.tp = np.asarray(tp, dtype=float)
        self.fp = np.asarray(fp, dtype=float)
        self.tn = np.asarray(tn, dtype=float)
        self.fn = np.asarray(fn, dtype=float)

    def tostring(self, verbose=False, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Parameters
----------
verbose : bool
Add evaluation for individual classes.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
ret = ''
if verbose:
# extract all classes
classes = []
if self.tp.any():
classes = np.append(classes, np.unique(self.tp[:, 1]))
if self.fp.any():
classes = np.append(classes, np.unique(self.fp[:, 1]))
if self.tn.any():
classes = np.append(classes, np.unique(self.tn[:, 1]))
if self.fn.any():
classes = np.append(classes, np.unique(self.fn[:, 1]))
for cls in sorted(np.unique(classes)):
# extract the TP, FP, TN and FN of this class
tp = self.tp[self.tp[:, 1] == cls]
fp = self.fp[self.fp[:, 1] == cls]
tn = self.tn[self.tn[:, 1] == cls]
fn = self.fn[self.fn[:, 1] == cls]
# evaluate them
e = Evaluation(tp, fp, tn, fn, name='Class %s' % cls)
# append to the output string
ret += ' %s\n' % e.tostring(verbose=False)
# normal formatting
        # normal formatting
        ret += 'Annotations: %5d TP: %5d FP: %5d FN: %5d ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
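
# Usage sketch (hypothetical events; the second tuple item is the class):
# >>> me = MultiClassEvaluation(tp=[(0.25, 1), (1.0, 2)], fn=[(2.0, 1)])
# >>> me.num_tp, me.num_annotations
# (2, 3)
# Calling me.tostring(verbose=True) additionally reports one line per class.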

# class for summing Evaluations
class SumEvaluation(SimpleEvaluation):
"""
Simple class for summing evaluations.
Parameters
----------
eval_objects : list
Evaluation objects.
name : str
Name to be displayed.
"""
def __init__(self, eval_objects, name=None):
        # pylint: disable=super-init-not-called
        # Note: we want to inherit the evaluation functions/properties, so we
        #       deliberately do not call super().__init__(), but we need to
        #       take care of 'name' ourselves
if not isinstance(eval_objects, list):
# wrap the given eval_object in a list
eval_objects = [eval_objects]
self.eval_objects = eval_objects
self.name = name or 'sum for %d files' % len(self)
def __len__(self):
# just use the length of the evaluation objects
return len(self.eval_objects)
# redefine the counters (number of TP, FP, TN, FN & number of annotations)
@property
def num_tp(self):
"""Number of true positive detections."""
return sum(e.num_tp for e in self.eval_objects)
@property
def num_fp(self):
"""Number of false positive detections."""
return sum(e.num_fp for e in self.eval_objects)
@property
def num_tn(self):
"""Number of true negative detections."""
return sum(e.num_tn for e in self.eval_objects)
@property
def num_fn(self):
"""Number of false negative detections."""
return sum(e.num_fn for e in self.eval_objects)
@property
def num_annotations(self):
"""Number of annotations."""
return sum(e.num_annotations for e in self.eval_objects)
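
# Usage sketch (illustrative per-file evaluations): the counters are summed
# before computing the metrics, i.e. a micro-average over all files.
# >>> e1 = SimpleEvaluation(num_tp=3, num_fp=1, num_fn=1)
# >>> e2 = SimpleEvaluation(num_tp=1, num_fn=3)
# >>> s = SumEvaluation([e1, e2])
# >>> s.num_tp, s.num_annotations, s.precision
# (4, 8, 0.8)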

# class for averaging Evaluations
class MeanEvaluation(SumEvaluation):
"""
    Simple class for averaging evaluations.
Parameters
----------
eval_objects : list
Evaluation objects.
name : str
Name to be displayed.
"""
def __init__(self, eval_objects, name=None, **kwargs):
super(MeanEvaluation, self).__init__(eval_objects, **kwargs)
# handle the 'name' here to be able to set a different default value
self.name = name or 'mean for %d files' % len(self)
# overwrite the properties to calculate the mean instead of the sum
@property
def num_tp(self):
"""Number of true positive detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_tp for e in self.eval_objects])
@property
def num_fp(self):
"""Number of false positive detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_fp for e in self.eval_objects])
@property
def num_tn(self):
"""Number of true negative detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_tn for e in self.eval_objects])
@property
def num_fn(self):
"""Number of false negative detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_fn for e in self.eval_objects])
@property
def num_annotations(self):
"""Number of annotations."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_annotations for e in self.eval_objects])
@property
def precision(self):
"""Precision."""
return np.nanmean([e.precision for e in self.eval_objects])
@property
def recall(self):
"""Recall."""
return np.nanmean([e.recall for e in self.eval_objects])
@property
def fmeasure(self):
"""F-measure."""
return np.nanmean([e.fmeasure for e in self.eval_objects])
@property
def accuracy(self):
"""Accuracy."""
return np.nanmean([e.accuracy for e in self.eval_objects])

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
        ret = ''
        if self.name is not None:
            ret += '%s\n ' % self.name
        # TODO: unify this with SimpleEvaluation.tostring() and add an option
        #       to provide field formatters (e.g. '3d' or '5.2f')
        # format with floats instead of integers
        ret += 'Annotations: %5.2f TP: %5.2f FP: %5.2f FN: %5.2f ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
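
# Usage sketch (same illustrative evaluations as above): here the metrics are
# averaged over the files instead, i.e. a macro-average.
# >>> m = MeanEvaluation([e1, e2])
# >>> float(m.precision)  # mean of 3/4 and 1/1
# 0.875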

def tostring(eval_objects, **kwargs):
"""
Format the given evaluation objects as human readable strings.
Parameters
----------
eval_objects : list
Evaluation objects.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
# pylint: disable=unused-argument
return '\n'.join([e.tostring() for e in eval_objects])

def tocsv(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
"""
Format the given evaluation objects as a CSV table.
Parameters
----------
eval_objects : list
Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric and the metric label, e.g. ('fp', 'False Positives').
float_format : str, optional
How to format the metrics.
Returns
-------
str
CSV table representation of the evaluation objects.
Notes
-----
If no `metric_names` are given, they will be extracted from the first
evaluation object.
"""
# pylint: disable=unused-argument
if metric_names is None:
# get the evaluation metrics from the first evaluation object
metric_names = eval_objects[0].METRIC_NAMES
metric_names, metric_labels = list(zip(*metric_names))
# add header
lines = ['Name,' + ','.join(metric_labels)]
# TODO: use e.metrics dict?
# add the evaluation objects
for e in eval_objects:
values = [float_format.format(getattr(e, mn)) for mn in metric_names]
lines.append(e.name + ',' + ','.join(values))
# return everything
return '\n'.join(lines)
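
# Usage sketch (illustrative object; the evaluation objects must have a name):
# >>> e = SimpleEvaluation(num_tp=3, num_fp=1, num_fn=1, name='demo')
# >>> print(tocsv([e], metric_names=[('precision', 'Precision'),
# ...                                ('recall', 'Recall')]))
# Name,Precision,Recall
# demo,0.750,0.750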

def totex(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
"""
Format the given evaluation objects as a LaTeX table.
Parameters
----------
eval_objects : list
Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric and the metric label, e.g. ('fp', 'False Positives').
float_format : str, optional
How to format the metrics.
Returns
-------
str
LaTeX table representation of the evaluation objects.
Notes
-----
If no `metric_names` are given, they will be extracted from the first
evaluation object.
"""
# pylint: disable=unused-argument
if metric_names is None:
# get the evaluation metrics from the first evaluation object
metric_names = eval_objects[0].METRIC_NAMES
metric_names, metric_labels = list(zip(*metric_names))
# add header
lines = ['Name & ' + ' & '.join(metric_labels) + '\\\\']
# TODO: use e.metrics dict
# TODO: add a generic totable() function which accepts columns separator,
# newline stuff (e.g. tex \\\\) and others
# add the evaluation objects
for e in eval_objects:
values = [float_format.format(getattr(e, mn)) for mn in metric_names]
lines.append(e.name + ' & ' + ' & '.join(values) + '\\\\')
# return everything
return '\n'.join(lines)
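
# The LaTeX variant of the sketch above renders '&'-separated columns:
# >>> print(totex([e], metric_names=[('precision', 'Precision')]))
# Name & Precision\\
# demo & 0.750\\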

def evaluation_io(parser, ann_suffix, det_suffix, ann_dir=None, det_dir=None):
"""
Add evaluation input/output and formatting related arguments to an existing
parser object.
Parameters
----------
parser : argparse parser instance
Existing argparse parser object.
ann_suffix : str
Suffix of the annotation files.
det_suffix : str
Suffix of the detection files.
ann_dir : str, optional
Use only annotations from this folder (and sub-folders).
det_dir : str, optional
Use only detections from this folder (and sub-folders).
Returns
-------
io_group : argparse argument group
Evaluation input / output argument group.
formatter_group : argparse argument group
Evaluation formatter argument group.
"""
import sys
import argparse
# general input output file handling
parser.add_argument('files', nargs='*',
help='files (or folders) to be evaluated')
parser.add_argument('-o', dest='outfile', type=argparse.FileType('w'),
default=sys.stdout,
help='output file [default: STDOUT]')
# file suffixes used for evaluation
g = parser.add_argument_group('file/folder/suffix arguments')
g.add_argument('-a', dest='ann_suffix', action='store', default=ann_suffix,
help='suffix of the annotation files '
'[default: %(default)s]')
g.add_argument('--ann_dir', action='store', default=ann_dir,
help='search only this directory (recursively) for '
'annotation files [default: %(default)s]')
g.add_argument('-d', dest='det_suffix', action='store', default=det_suffix,
help='suffix of the detection files [default: %(default)s]')
g.add_argument('--det_dir', action='store', default=det_dir,
help='search only this directory (recursively) for '
'detection files [default: %(default)s]')
# option to ignore non-existing detections
g.add_argument('-i', '--ignore_non_existing', action='store_true',
help='ignore non-existing detections [default: raise a '
'warning and assume empty detections]')
# verbose
parser.add_argument('-v', '--verbose', action='count', default=0,
help='increase verbosity level')
# option to suppress warnings
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress any warnings')
# output format options
parser.set_defaults(output_formatter=tostring)
f = parser.add_argument_group('formatting arguments')
formats = f.add_mutually_exclusive_group()
formats.add_argument('--tex', dest='output_formatter',
action='store_const', const=totex,
help='format output to be used in .tex files')
formats.add_argument('--csv', dest='output_formatter',
action='store_const', const=tocsv,
help='format output to be used in .csv files')
    # return both argument groups so the caller can add more options
    return g, f
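
# Usage sketch (hypothetical suffixes, for illustration only):
# >>> import argparse
# >>> p = argparse.ArgumentParser()
# >>> io_group, formatter_group = evaluation_io(p, '.beats', '.beats.txt')
# >>> args = p.parse_args(['data_folder', '--csv'])
# >>> args.output_formatter is tocsv
# True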

# finally import the submodules
from . import beats, chords, key, notes, onsets, tempo
# import often used classes
from .beats import BeatEvaluation, BeatMeanEvaluation
from .chords import ChordEvaluation, ChordMeanEvaluation, ChordSumEvaluation
from .key import KeyEvaluation, KeyMeanEvaluation
from .notes import NoteEvaluation, NoteMeanEvaluation, NoteSumEvaluation
from .onsets import OnsetEvaluation, OnsetMeanEvaluation, OnsetSumEvaluation
from .tempo import TempoEvaluation, TempoMeanEvaluation