Source code for madmom.evaluation

# encoding: utf-8
# pylint: disable=no-member
# pylint: disable=invalid-name
# pylint: disable=too-many-arguments
# pylint: disable=wrong-import-position
"""
Evaluation package.

"""

from __future__ import absolute_import, division, print_function

import numpy as np


# evaluation helper functions
def find_closest_matches(detections, annotations):
    """
    Find the closest annotation for each detection.

    Parameters
    ----------
    detections : list or numpy array
        Detected events.
    annotations : list or numpy array
        Annotated events.

    Returns
    -------
    indices : numpy array
        Indices of the closest matches.

    Notes
    -----
    The sequences must be ordered.

    """
    # make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=np.float)
    annotations = np.asarray(annotations, dtype=np.float)
    # TODO: right now, it only works with 1D arrays
    if detections.ndim > 1 or annotations.ndim > 1:
        raise NotImplementedError('please implement multi-dim support')
    # if no detections or annotations are given
    if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=np.int)
    # if only a single annotation is given
    if len(annotations) == 1:
        # return an array as long as the detections with indices 0
        return np.zeros(len(detections), dtype=np.int)
    # solution found at: http://stackoverflow.com/questions/8914491/
    indices = annotations.searchsorted(detections)
    indices = np.clip(indices, 1, len(annotations) - 1)
    left = annotations[indices - 1]
    right = annotations[indices]
    indices -= detections - left < right - detections
    # return the indices of the closest matches
    return indices
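
# Illustrative usage sketch (not part of the original module; the event times
# below are made up). Both sequences must be sorted:
#
#   find_closest_matches([0.9, 1.1, 2.4], [1.0, 2.0, 3.0])
#   # -> array([0, 0, 1]), i.e. the first two detections are closest to the
#   #    annotation at 1.0, the third one to the annotation at 2.0
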
def calc_errors(detections, annotations, matches=None):
    """
    Errors of the detections to the closest annotations.

    Parameters
    ----------
    detections : list or numpy array
        Detected events.
    annotations : list or numpy array
        Annotated events.
    matches : list or numpy array
        Indices of the closest events.

    Returns
    -------
    errors : numpy array
        Errors.

    Notes
    -----
    The sequences must be ordered. To speed up the calculation, a list of
    pre-computed indices of the closest matches can be used.

    """
    # make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=np.float)
    annotations = np.asarray(annotations, dtype=np.float)
    if matches is not None:
        matches = np.asarray(matches, dtype=np.int)
    # TODO: right now, it only works with 1D arrays
    if detections.ndim > 1 or annotations.ndim > 1:
        raise NotImplementedError('please implement multi-dim support')
    # if no detections or annotations are given
    if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=np.float)
    # determine the closest annotations
    if matches is None:
        matches = find_closest_matches(detections, annotations)
    # calc error relative to those annotations
    errors = detections - annotations[matches]
    # return the errors
    return errors
def calc_absolute_errors(detections, annotations, matches=None):
    """
    Absolute errors of the detections to the closest annotations.

    Parameters
    ----------
    detections : list or numpy array
        Detected events.
    annotations : list or numpy array
        Annotated events.
    matches : list or numpy array
        Indices of the closest events.

    Returns
    -------
    errors : numpy array
        Absolute errors.

    Notes
    -----
    The sequences must be ordered. To speed up the calculation, a list of
    pre-computed indices of the closest matches can be used.

    """
    # make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=np.float)
    annotations = np.asarray(annotations, dtype=np.float)
    if matches is not None:
        matches = np.asarray(matches, dtype=np.int)
    # TODO: right now, it only works with 1D arrays
    if detections.ndim > 1 or annotations.ndim > 1:
        raise NotImplementedError('please implement multi-dim support')
    # return the errors
    return np.abs(calc_errors(detections, annotations, matches))
def calc_relative_errors(detections, annotations, matches=None):
    """
    Relative errors of the detections to the closest annotations.

    Parameters
    ----------
    detections : list or numpy array
        Detected events.
    annotations : list or numpy array
        Annotated events.
    matches : list or numpy array
        Indices of the closest events.

    Returns
    -------
    errors : numpy array
        Relative errors.

    Notes
    -----
    The sequences must be ordered. To speed up the calculation, a list of
    pre-computed indices of the closest matches can be used.

    """
    # make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=np.float)
    annotations = np.asarray(annotations, dtype=np.float)
    if matches is not None:
        matches = np.asarray(matches, dtype=np.int)
    # TODO: right now, it only works with 1D arrays
    if detections.ndim > 1 or annotations.ndim > 1:
        raise NotImplementedError('please implement multi-dim support')
    # if no detections or annotations are given
    if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=np.float)
    # determine the closest annotations
    if matches is None:
        matches = find_closest_matches(detections, annotations)
    # calculate the errors w.r.t. the closest annotations
    errors = calc_errors(detections, annotations, matches)
    # return the relative errors
    return np.abs(1 - (errors / annotations[matches]))
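
# Illustrative usage sketch for the three error functions above (not part of
# the original module; event times are made up):
#
#   det, ann = [0.9, 1.1, 2.4], [1.0, 2.0, 3.0]
#   calc_errors(det, ann)           # detection - closest annotation,
#                                   # roughly [-0.1, 0.1, 0.4]
#   calc_absolute_errors(det, ann)  # absolute values, roughly [0.1, 0.1, 0.4]
#   calc_relative_errors(det, ann)  # |1 - error / closest annotation|
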
# abstract evaluation base class
class EvaluationMixin(object):
    """
    Evaluation mixin class.

    This class has a `name` attribute which is used for display purposes and
    defaults to None.

    `METRIC_NAMES` is a list of tuples, containing the attribute's name and
    the corresponding label, e.g. ('precision', 'Precision'); see the comment
    below for a complete example.

    The attributes defined in `METRIC_NAMES` will be provided as an ordered
    dictionary as the `metrics` property unless the subclass overwrites the
    property.

    `FLOAT_FORMAT` is used to format floats.

    """
    # Example:
    # METRIC_NAMES = [
    #     ('precision', 'Precision'),
    #     ('recall', 'Recall'),
    #     ('fmeasure', 'F-measure'),
    # ]

    name = None
    METRIC_NAMES = []
    FLOAT_FORMAT = '{:.3f}'

    @property
    def metrics(self):
        """Metrics as a dictionary."""
        from collections import OrderedDict
        metrics = OrderedDict()
        for metric in [m[0] for m in self.METRIC_NAMES]:
            metrics[metric] = getattr(self, metric)
        return metrics

    def __len__(self):
        """Length of the evaluation object."""
        raise NotImplementedError('must be implemented by subclass.')
    def tostring(self, **kwargs):
        """
        Format the evaluation metrics as a human readable string.

        Returns
        -------
        str
            Evaluation metrics formatted as a human readable string.

        Notes
        -----
        This is a fallback method formatting the `metrics` dictionary in a
        human readable way. Classes inheriting from this mixin class should
        provide a more suitable method.

        """
        # pylint: disable=unused-argument
        import pprint
        return pprint.pformat(dict(self.metrics), indent=4)
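
# Illustrative subclassing sketch (not part of the original module; the class
# name and attributes are hypothetical): a class using the mixin only needs to
# define `METRIC_NAMES`, matching attributes and `__len__`; the `metrics`
# property and `tostring()` then work as-is.
#
#   class HypotheticalEval(EvaluationMixin):
#       METRIC_NAMES = [('precision', 'Precision'), ('recall', 'Recall')]
#
#       def __init__(self, precision, recall, name=None):
#           self.precision = precision
#           self.recall = recall
#           self.name = name
#
#       def __len__(self):
#           return 1
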
# evaluation classes
class SimpleEvaluation(EvaluationMixin):
    """
    Simple Precision, Recall, F-measure and Accuracy evaluation based on the
    numbers of true/false positive/negative detections.

    Parameters
    ----------
    num_tp : int
        Number of true positive detections.
    num_fp : int
        Number of false positive detections.
    num_tn : int
        Number of true negative detections.
    num_fn : int
        Number of false negative detections.
    name : str
        Name to be displayed.

    Notes
    -----
    This class is only suitable for a 1-class evaluation problem.

    """
    METRIC_NAMES = [
        ('num_tp', 'No. of true positives'),
        ('num_fp', 'No. of false positives'),
        ('num_tn', 'No. of true negatives'),
        ('num_fn', 'No. of false negatives'),
        ('num_annotations', 'No. Annotations'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('fmeasure', 'F-measure'),
        ('accuracy', 'Accuracy'),
    ]

    def __init__(self, num_tp=0, num_fp=0, num_tn=0, num_fn=0, name=None,
                 **kwargs):
        # pylint: disable=unused-argument
        # hidden variables, to be able to overwrite them in subclasses
        self._num_tp = int(num_tp)
        self._num_fp = int(num_fp)
        self._num_tn = int(num_tn)
        self._num_fn = int(num_fn)
        # name of the evaluation
        self.name = name

    @property
    def num_tp(self):
        """Number of true positive detections."""
        return self._num_tp

    @property
    def num_fp(self):
        """Number of false positive detections."""
        return self._num_fp

    @property
    def num_tn(self):
        """Number of true negative detections."""
        return self._num_tn

    @property
    def num_fn(self):
        """Number of false negative detections."""
        return self._num_fn

    @property
    def num_annotations(self):
        """Number of annotations."""
        return self.num_tp + self.num_fn

    def __len__(self):
        # the length equals the number of annotations
        return self.num_annotations

    @property
    def precision(self):
        """Precision."""
        # correct / retrieved
        retrieved = float(self.num_tp + self.num_fp)
        # if there are no positive predictions, none of them are wrong
        if retrieved == 0:
            return 1.
        return self.num_tp / retrieved

    @property
    def recall(self):
        """Recall."""
        # correct / relevant
        relevant = float(self.num_tp + self.num_fn)
        # if there are no positive annotations, we recalled all of them
        if relevant == 0:
            return 1.
        return self.num_tp / relevant

    @property
    def fmeasure(self):
        """F-measure."""
        # 2pr / (p+r)
        numerator = 2. * self.precision * self.recall
        if numerator == 0:
            return 0.
        return numerator / (self.precision + self.recall)

    @property
    def accuracy(self):
        """Accuracy."""
        # acc: (TP + TN) / (TP + FP + TN + FN)
        denominator = self.num_fp + self.num_fn + self.num_tp + self.num_tn
        if denominator == 0:
            return 1.
        numerator = float(self.num_tp + self.num_tn)
        if numerator == 0:
            return 0.
        return numerator / denominator
    def tostring(self, **kwargs):
        """
        Format the evaluation metrics as a human readable string.

        Returns
        -------
        str
            Evaluation metrics formatted as a human readable string.

        """
        ret = ''
        if self.name is not None:
            ret += '%s\n ' % self.name
        ret += 'Annotations: %5d TP: %5d FP: %5d FN: %5d ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
    def __str__(self):
        return self.tostring()
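
# Illustrative usage sketch (not part of the original module; the counts are
# made up):
#
#   e = SimpleEvaluation(num_tp=8, num_fp=2, num_fn=1, name='some_file')
#   print(e)    # precision = 8 / 10 = 0.8, recall = 8 / 9, etc.
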
# evaluate Precision, Recall, F-measure and Accuracy with lists or numpy arrays
class Evaluation(SimpleEvaluation):
    """
    Evaluation class for measuring Precision, Recall and F-measure based on
    numpy arrays or lists with true/false positive/negative detections.

    Parameters
    ----------
    tp : list or numpy array
        True positive detections.
    fp : list or numpy array
        False positive detections.
    tn : list or numpy array
        True negative detections.
    fn : list or numpy array
        False negative detections.
    name : str
        Name to be displayed.

    """
    METRIC_NAMES = [
        ('tp', 'True positives'),
        ('fp', 'False positives'),
        ('tn', 'True negatives'),
        ('fn', 'False negatives'),
        ('num_tp', 'No. of true positives'),
        ('num_fp', 'No. of false positives'),
        ('num_tn', 'No. of true negatives'),
        ('num_fn', 'No. of false negatives'),
        ('num_annotations', 'No. Annotations'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('fmeasure', 'F-measure'),
        ('accuracy', 'Accuracy'),
    ]

    def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
        # set default values
        if tp is None:
            tp = []
        if fp is None:
            fp = []
        if tn is None:
            tn = []
        if fn is None:
            fn = []
        # instantiate a SimpleEvaluation object
        super(Evaluation, self).__init__(**kwargs)
        # convert everything to numpy arrays and save them
        self.tp = np.asarray(list(tp), dtype=np.float)
        self.fp = np.asarray(list(fp), dtype=np.float)
        self.tn = np.asarray(list(tn), dtype=np.float)
        self.fn = np.asarray(list(fn), dtype=np.float)

    @property
    def num_tp(self):
        """Number of true positive detections."""
        return len(self.tp)

    @property
    def num_fp(self):
        """Number of false positive detections."""
        return len(self.fp)

    @property
    def num_tn(self):
        """Number of true negative detections."""
        return len(self.tn)

    @property
    def num_fn(self):
        """Number of false negative detections."""
        return len(self.fn)
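
# Illustrative usage sketch (not part of the original module; the event lists
# are made up). Unlike SimpleEvaluation, the counts are derived from the
# lengths of the given arrays:
#
#   e = Evaluation(tp=[0.5, 1.0, 2.1], fp=[3.0], fn=[4.2])
#   e.num_tp, e.num_fp, e.num_fn    # -> (3, 1, 1)
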
# class for evaluation of Precision, Recall, F-measure with 2D arrays
class MultiClassEvaluation(Evaluation):
    """
    Evaluation class for measuring Precision, Recall and F-measure based on
    2D numpy arrays with true/false positive/negative detections.

    Parameters
    ----------
    tp : list of tuples or numpy array, shape (num_tp, 2)
        True positive detections.
    fp : list of tuples or numpy array, shape (num_fp, 2)
        False positive detections.
    tn : list of tuples or numpy array, shape (num_tn, 2)
        True negative detections.
    fn : list of tuples or numpy array, shape (num_fn, 2)
        False negative detections.
    name : str
        Name to be displayed.

    Notes
    -----
    The second item of the tuples or the second column of the arrays denotes
    the class the detection belongs to.

    """

    def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
        # set default values
        if tp is None:
            tp = np.zeros((0, 2))
        if fp is None:
            fp = np.zeros((0, 2))
        if tn is None:
            tn = np.zeros((0, 2))
        if fn is None:
            fn = np.zeros((0, 2))
        super(MultiClassEvaluation, self).__init__(**kwargs)
        self.tp = np.asarray(tp, dtype=np.float)
        self.fp = np.asarray(fp, dtype=np.float)
        self.tn = np.asarray(tn, dtype=np.float)
        self.fn = np.asarray(fn, dtype=np.float)
    def tostring(self, verbose=False, **kwargs):
        """
        Format the evaluation metrics as a human readable string.

        Parameters
        ----------
        verbose : bool
            Add evaluation for individual classes.

        Returns
        -------
        str
            Evaluation metrics formatted as a human readable string.

        """
        ret = ''
        if verbose:
            # extract all classes
            classes = []
            if self.tp.any():
                classes = np.append(classes, np.unique(self.tp[:, 1]))
            if self.fp.any():
                classes = np.append(classes, np.unique(self.fp[:, 1]))
            if self.tn.any():
                classes = np.append(classes, np.unique(self.tn[:, 1]))
            if self.fn.any():
                classes = np.append(classes, np.unique(self.fn[:, 1]))
            for cls in sorted(np.unique(classes)):
                # extract the TP, FP, TN and FN of this class
                tp = self.tp[self.tp[:, 1] == cls]
                fp = self.fp[self.fp[:, 1] == cls]
                tn = self.tn[self.tn[:, 1] == cls]
                fn = self.fn[self.fn[:, 1] == cls]
                # evaluate them
                e = Evaluation(tp, fp, tn, fn, name='Class %s' % cls)
                # append to the output string
                ret += ' %s\n' % e.tostring(verbose=False)
        # normal formatting
        ret += 'Annotations: %5d TP: %5d FP: %4d FN: %4d ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        # return
        return ret
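
# Illustrative usage sketch (not part of the original module; the detections
# are made up). The second element of each tuple is the class label:
#
#   e = MultiClassEvaluation(tp=[(0.5, 1), (1.0, 2)], fp=[(2.0, 1)])
#   print(e.tostring(verbose=True))  # adds one line per class before the
#                                    # overall summary line
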
# class for summing Evaluations
class SumEvaluation(SimpleEvaluation):
    """
    Simple class for summing evaluations.

    Parameters
    ----------
    eval_objects : list
        Evaluation objects.
    name : str
        Name to be displayed.

    """

    def __init__(self, eval_objects, name=None):
        # pylint: disable=super-init-not-called
        # Note: we want to inherit the evaluation functions/properties, so
        #       there is no need to call super().__init__(), but we need to
        #       take care of 'name' ourselves
        if not isinstance(eval_objects, list):
            # wrap the given eval_object in a list
            eval_objects = [eval_objects]
        self.eval_objects = eval_objects
        self.name = name or 'sum for %d files' % len(self)

    def __len__(self):
        # just use the length of the evaluation objects
        return len(self.eval_objects)

    # redefine the counters (number of TP, FP, TN, FN & number of annotations)
    @property
    def num_tp(self):
        """Number of true positive detections."""
        return sum(e.num_tp for e in self.eval_objects)

    @property
    def num_fp(self):
        """Number of false positive detections."""
        return sum(e.num_fp for e in self.eval_objects)

    @property
    def num_tn(self):
        """Number of true negative detections."""
        return sum(e.num_tn for e in self.eval_objects)

    @property
    def num_fn(self):
        """Number of false negative detections."""
        return sum(e.num_fn for e in self.eval_objects)

    @property
    def num_annotations(self):
        """Number of annotations."""
        return sum(e.num_annotations for e in self.eval_objects)
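
# Illustrative usage sketch (not part of the original module; the counts are
# made up). SumEvaluation adds up the raw counts of the individual files and
# computes the metrics from those sums:
#
#   e1 = SimpleEvaluation(num_tp=8, num_fp=2, num_fn=1)
#   e2 = SimpleEvaluation(num_tp=4, num_fp=0, num_fn=4)
#   SumEvaluation([e1, e2]).num_tp      # -> 12
#   SumEvaluation([e1, e2]).precision   # -> 12 / 14
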
# class for averaging Evaluations
class MeanEvaluation(SumEvaluation):
    """
    Simple class for averaging evaluations.

    Parameters
    ----------
    eval_objects : list
        Evaluation objects.
    name : str
        Name to be displayed.

    """

    def __init__(self, eval_objects, name=None, **kwargs):
        super(MeanEvaluation, self).__init__(eval_objects, **kwargs)
        # handle the 'name' here to be able to set a different default value
        self.name = name or 'mean for %d files' % len(self)

    # overwrite the properties to calculate the mean instead of the sum
    @property
    def num_tp(self):
        """Number of true positive detections."""
        if not self.eval_objects:
            return 0.
        return np.nanmean([e.num_tp for e in self.eval_objects])

    @property
    def num_fp(self):
        """Number of false positive detections."""
        if not self.eval_objects:
            return 0.
        return np.nanmean([e.num_fp for e in self.eval_objects])

    @property
    def num_tn(self):
        """Number of true negative detections."""
        if not self.eval_objects:
            return 0.
        return np.nanmean([e.num_tn for e in self.eval_objects])

    @property
    def num_fn(self):
        """Number of false negative detections."""
        if not self.eval_objects:
            return 0.
        return np.nanmean([e.num_fn for e in self.eval_objects])

    @property
    def num_annotations(self):
        """Number of annotations."""
        if not self.eval_objects:
            return 0.
        return np.nanmean([e.num_annotations for e in self.eval_objects])

    @property
    def precision(self):
        """Precision."""
        return np.nanmean([e.precision for e in self.eval_objects])

    @property
    def recall(self):
        """Recall."""
        return np.nanmean([e.recall for e in self.eval_objects])

    @property
    def fmeasure(self):
        """F-measure."""
        return np.nanmean([e.fmeasure for e in self.eval_objects])

    @property
    def accuracy(self):
        """Accuracy."""
        return np.nanmean([e.accuracy for e in self.eval_objects])
    def tostring(self, **kwargs):
        """
        Format the evaluation metrics as a human readable string.

        Returns
        -------
        str
            Evaluation metrics formatted as a human readable string.

        """
        ret = ''
        if self.name is not None:
            ret += '%s\n ' % self.name
        # TODO: unify this with SimpleEvaluation, but add an option to provide
        #       field formatters (e.g. 3d or 5.2f)
        # format with floats instead of integers
        ret += 'Annotations: %5.2f TP: %5.2f FP: %5.2f FN: %5.2f ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
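
# Illustrative usage sketch (not part of the original module; continues the
# e1/e2 example above). MeanEvaluation averages counts and metrics over the
# individual files instead of summing the counts first:
#
#   MeanEvaluation([e1, e2]).precision  # -> mean of 0.8 and 1.0 = 0.9
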
def tostring(eval_objects, **kwargs):
    """
    Format the given evaluation objects as human readable strings.

    Parameters
    ----------
    eval_objects : list
        Evaluation objects.

    Returns
    -------
    str
        Evaluation metrics formatted as a human readable string.

    """
    # pylint: disable=unused-argument
    return '\n'.join([e.tostring() for e in eval_objects])
def tocsv(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
    """
    Format the given evaluation objects as a CSV table.

    Parameters
    ----------
    eval_objects : list
        Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric, and the metric label, e.g. ('fp', 'False Positives').
    float_format : str, optional
        How to format the metrics.

    Returns
    -------
    str
        CSV table representation of the evaluation objects.

    Notes
    -----
    If no `metric_names` are given, they will be extracted from the first
    evaluation object.

    """
    # pylint: disable=unused-argument
    if metric_names is None:
        # get the evaluation metrics from the first evaluation object
        metric_names = eval_objects[0].METRIC_NAMES
    metric_names, metric_labels = list(zip(*metric_names))
    # add header
    lines = ['Name,' + ','.join(metric_labels)]
    # TODO: use e.metrics dict?
    # add the evaluation objects
    for e in eval_objects:
        values = [float_format.format(getattr(e, mn)) for mn in metric_names]
        lines.append(e.name + ',' + ','.join(values))
    # return everything
    return '\n'.join(lines)
def totex(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
    """
    Format the given evaluation objects as a LaTeX table.

    Parameters
    ----------
    eval_objects : list
        Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric, and the metric label, e.g. ('fp', 'False Positives').
    float_format : str, optional
        How to format the metrics.

    Returns
    -------
    str
        LaTeX table representation of the evaluation objects.

    Notes
    -----
    If no `metric_names` are given, they will be extracted from the first
    evaluation object.

    """
    # pylint: disable=unused-argument
    if metric_names is None:
        # get the evaluation metrics from the first evaluation object
        metric_names = eval_objects[0].METRIC_NAMES
    metric_names, metric_labels = list(zip(*metric_names))
    # add header
    lines = ['Name & ' + ' & '.join(metric_labels) + '\\\\']
    # TODO: use e.metrics dict
    # TODO: add a generic totable() function which accepts columns separator,
    #       newline stuff (e.g. tex \\\\) and others
    # add the evaluation objects
    for e in eval_objects:
        values = [float_format.format(getattr(e, mn)) for mn in metric_names]
        lines.append(e.name + ' & ' + ' & '.join(values) + '\\\\')
    # return everything
    return '\n'.join(lines)
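
# Illustrative usage sketch for the formatters above (not part of the original
# module; the evaluation object is made up). Note that the objects need a
# `name` for the CSV/LaTeX output:
#
#   evals = [SimpleEvaluation(num_tp=8, num_fp=2, num_fn=1, name='some_file')]
#   print(tostring(evals))  # one human readable line per object
#   print(tocsv(evals))     # header row plus one comma-separated row each
#   print(totex(evals))     # same table with ' & ' separators and '\\' endings
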
def evaluation_io(parser, ann_suffix, det_suffix, ann_dir=None, det_dir=None):
    """
    Add evaluation input/output and formatting related arguments to an
    existing parser object.

    Parameters
    ----------
    parser : argparse parser instance
        Existing argparse parser object.
    ann_suffix : str
        Suffix of the annotation files.
    det_suffix : str
        Suffix of the detection files.
    ann_dir : str, optional
        Use only annotations from this folder (and sub-folders).
    det_dir : str, optional
        Use only detections from this folder (and sub-folders).

    Returns
    -------
    io_group : argparse argument group
        Evaluation input / output argument group.
    formatter_group : argparse argument group
        Evaluation formatter argument group.

    """
    import sys
    import argparse
    # general input output file handling
    parser.add_argument('files', nargs='*',
                        help='files (or folders) to be evaluated')
    parser.add_argument('-o', dest='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file [default: STDOUT]')
    # file suffixes used for evaluation
    g = parser.add_argument_group('file/folder/suffix arguments')
    g.add_argument('-a', dest='ann_suffix', action='store',
                   default=ann_suffix,
                   help='suffix of the annotation files '
                        '[default: %(default)s]')
    g.add_argument('--ann_dir', action='store', default=ann_dir,
                   help='search only this directory (recursively) for '
                        'annotation files [default: %(default)s]')
    g.add_argument('-d', dest='det_suffix', action='store',
                   default=det_suffix,
                   help='suffix of the detection files '
                        '[default: %(default)s]')
    g.add_argument('--det_dir', action='store', default=det_dir,
                   help='search only this directory (recursively) for '
                        'detection files [default: %(default)s]')
    # option to ignore non-existing detections
    g.add_argument('-i', '--ignore_non_existing', action='store_true',
                   help='ignore non-existing detections [default: raise a '
                        'warning and assume empty detections]')
    # verbose
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='increase verbosity level')
    # option to suppress warnings
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress any warnings')
    # output format options
    parser.set_defaults(output_formatter=tostring)
    f = parser.add_argument_group('formatting arguments')
    formats = f.add_mutually_exclusive_group()
    formats.add_argument('--tex', dest='output_formatter',
                         action='store_const', const=totex,
                         help='format output to be used in .tex files')
    formats.add_argument('--csv', dest='output_formatter',
                         action='store_const', const=tocsv,
                         help='format output to be used in .csv files')
    # return the output formatting group so the caller can add more options
    return g, f
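
# Illustrative usage sketch (not part of the original module; the suffixes and
# script structure are hypothetical). A typical evaluation script adds these
# arguments to its own argparse parser and lets the chosen formatter render
# the results:
#
#   import argparse
#   parser = argparse.ArgumentParser(description='hypothetical evaluation')
#   evaluation_io(parser, ann_suffix='.annotations', det_suffix='.detections')
#   args = parser.parse_args()
#   # ... build a list of evaluation objects `evals` from args.files ...
#   # print(args.output_formatter(evals, **vars(args)), file=args.outfile)
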
# finally import the submodules
from . import onsets, beats, notes, tempo, alignment