# encoding: utf-8
# pylint: disable=no-member
# pylint: disable=invalid-name
# pylint: disable=too-many-arguments
# pylint: disable=wrong-import-position
"""
Evaluation package.
"""
from __future__ import absolute_import, division, print_function
import numpy as np

# evaluation helper functions
def find_closest_matches(detections, annotations):
"""
Find the closest annotation for each detection.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
Returns
-------
indices : numpy array
Indices of the closest matches.
Notes
-----
The sequences must be ordered.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=int)
# if only a single annotation is given
if len(annotations) == 1:
        # all detections match the only annotation, thus return zero indices
        return np.zeros(len(detections), dtype=int)
# solution found at: http://stackoverflow.com/questions/8914491/
indices = annotations.searchsorted(detections)
indices = np.clip(indices, 1, len(annotations) - 1)
left = annotations[indices - 1]
right = annotations[indices]
    # decrement the index if the detection is closer to the left neighbour
    indices -= detections - left < right - detections
# return the indices of the closest matches
return indices
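
# A minimal usage sketch (illustrative values, not part of the module): with
# ordered 1D sequences, each detection is mapped to the index of its nearest
# annotation.
# >>> find_closest_matches([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([0, 1, 2])
# Ties are resolved towards the right neighbour, since the index is only
# decremented if the detection is strictly closer to the left neighbour.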

def calc_errors(detections, annotations, matches=None):
"""
Errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=float)
# determine the closest annotations
if matches is None:
matches = find_closest_matches(detections, annotations)
    # calculate the signed errors against the matched annotations
    errors = detections - annotations[matches]
# return the errors
return errors
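
# Usage sketch (same illustrative values as above): signed errors to the
# closest annotations, negative if the detection precedes its annotation.
# >>> calc_errors([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([ 0.25,  0.25, -0.25])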

def calc_absolute_errors(detections, annotations, matches=None):
"""
Absolute errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Absolute errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# return the errors
return np.abs(calc_errors(detections, annotations, matches))
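
# Usage sketch (same illustrative values as above): absolute errors simply
# drop the sign of calc_errors().
# >>> calc_absolute_errors([0.25, 1.25, 1.75], [0.0, 1.0, 2.0])
# array([0.25, 0.25, 0.25])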

def calc_relative_errors(detections, annotations, matches=None):
"""
Relative errors of the detections to the closest annotations.
Parameters
----------
detections : list or numpy array
Detected events.
annotations : list or numpy array
Annotated events.
matches : list or numpy array
Indices of the closest events.
Returns
-------
errors : numpy array
Relative errors.
Notes
-----
The sequences must be ordered. To speed up the calculation, a list of
pre-computed indices of the closest matches can be used.
"""
# make sure the arrays have the correct types
    detections = np.asarray(detections, dtype=float)
    annotations = np.asarray(annotations, dtype=float)
    if matches is not None:
        matches = np.asarray(matches, dtype=int)
# TODO: right now, it only works with 1D arrays
if detections.ndim > 1 or annotations.ndim > 1:
raise NotImplementedError('please implement multi-dim support')
# if no detections or annotations are given
if len(detections) == 0 or len(annotations) == 0:
        # return an empty array
        return np.zeros(0, dtype=float)
# determine the closest annotations
if matches is None:
matches = find_closest_matches(detections, annotations)
    # calculate the signed errors
    errors = calc_errors(detections, annotations, matches)
    # return the errors relative to the values of the matched annotations
    return np.abs(errors / annotations[matches])
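
# Usage sketch (illustrative values only): errors are normalised by the value
# of the matched annotation, which must therefore be non-zero.
# >>> calc_relative_errors([1.25, 1.5, 5.0], [1.0, 2.0, 4.0])
# array([0.25, 0.25, 0.25])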

# abstract evaluation mixin class
class EvaluationMixin(object):
"""
Evaluation mixin class.
This class has a `name` attribute which is used for display purposes and
defaults to 'None'.
    `METRIC_NAMES` is a list of tuples, each containing an attribute's name
    and the corresponding label; see the commented example below this
    docstring.
The attributes defined in `METRIC_NAMES` will be provided as an ordered
dictionary as the `metrics` property unless the subclass overwrites the
property.
`FLOAT_FORMAT` is used to format floats.
"""
# Example:
# METRIC_NAMES = [
# ('precision', 'Precision'),
# ('recall', 'Recall'),
# ('fmeasure', 'F-measure'),
# ]
name = None
METRIC_NAMES = []
FLOAT_FORMAT = '{:.3f}'
@property
def metrics(self):
"""Metrics as a dictionary."""
        from collections import OrderedDict
        metrics = OrderedDict()
for metric in [m[0] for m in self.METRIC_NAMES]:
metrics[metric] = getattr(self, metric)
return metrics
def __len__(self):
"""Length of the evaluation object."""
raise NotImplementedError('must be implemented by subclass.')

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
Notes
-----
        This is a fallback method which formats the `metrics` dictionary in a
        human readable way. Classes inheriting from this mixin should provide
        a better suited method.
"""
# pylint: disable=unused-argument
import pprint
return pprint.pformat(dict(self.metrics), indent=4)
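
# A minimal subclass sketch (hypothetical class, for illustration only): every
# attribute listed in METRIC_NAMES is picked up by the `metrics` property.
# >>> class CountEvaluation(EvaluationMixin):
# ...     METRIC_NAMES = [('count', 'Count')]
# ...     count = 42
# >>> CountEvaluation().metrics
# OrderedDict([('count', 42)])
# (the exact repr depends on the Python version)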

# evaluation classes
class SimpleEvaluation(EvaluationMixin):
"""
Simple Precision, Recall, F-measure and Accuracy evaluation based on the
numbers of true/false positive/negative detections.
Parameters
----------
num_tp : int
Number of true positive detections.
num_fp : int
Number of false positive detections.
num_tn : int
Number of true negative detections.
num_fn : int
Number of false negative detections.
name : str
Name to be displayed.
Notes
-----
This class is only suitable for a 1-class evaluation problem.
"""
METRIC_NAMES = [
('num_tp', 'No. of true positives'),
('num_fp', 'No. of false positives'),
('num_tn', 'No. of true negatives'),
('num_fn', 'No. of false negatives'),
        ('num_annotations', 'No. of annotations'),
('precision', 'Precision'),
('recall', 'Recall'),
('fmeasure', 'F-measure'),
('accuracy', 'Accuracy'),
]
def __init__(self, num_tp=0, num_fp=0, num_tn=0, num_fn=0, name=None,
**kwargs):
# pylint: disable=unused-argument
# hidden variables, to be able to overwrite them in subclasses
self._num_tp = int(num_tp)
self._num_fp = int(num_fp)
self._num_tn = int(num_tn)
self._num_fn = int(num_fn)
# name of the evaluation
self.name = name
@property
def num_tp(self):
"""Number of true positive detections."""
return self._num_tp
@property
def num_fp(self):
"""Number of false positive detections."""
return self._num_fp
@property
def num_tn(self):
"""Number of true negative detections."""
return self._num_tn
@property
def num_fn(self):
"""Number of false negative detections."""
return self._num_fn
@property
def num_annotations(self):
"""Number of annotations."""
return self.num_tp + self.num_fn
def __len__(self):
# the length equals the number of annotations
return self.num_annotations
@property
def precision(self):
"""Precision."""
# correct / retrieved
retrieved = float(self.num_tp + self.num_fp)
# if there are no positive predictions, none of them are wrong
if retrieved == 0:
return 1.
return self.num_tp / retrieved
@property
def recall(self):
"""Recall."""
# correct / relevant
relevant = float(self.num_tp + self.num_fn)
# if there are no positive annotations, we recalled all of them
if relevant == 0:
return 1.
return self.num_tp / relevant
@property
def fmeasure(self):
"""F-measure."""
# 2pr / (p+r)
numerator = 2. * self.precision * self.recall
if numerator == 0:
return 0.
return numerator / (self.precision + self.recall)
@property
def accuracy(self):
"""Accuracy."""
# acc: (TP + TN) / (TP + FP + TN + FN)
denominator = self.num_fp + self.num_fn + self.num_tp + self.num_tn
if denominator == 0:
return 1.
numerator = float(self.num_tp + self.num_tn)
if numerator == 0:
return 0.
return numerator / denominator

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
ret = ''
if self.name is not None:
ret += '%s\n ' % self.name
ret += 'Annotations: %5d TP: %5d FP: %5d FN: %5d ' \
'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
(self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
self.precision, self.recall, self.fmeasure, self.accuracy)
return ret
def __str__(self):
return self.tostring()
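
# Usage sketch (illustrative counts only): 8 of 10 annotated events were
# detected, along with 2 false positives.
# >>> e = SimpleEvaluation(num_tp=8, num_fp=2, num_fn=2, name='demo')
# >>> e.num_annotations, e.precision, e.recall
# (10, 0.8, 0.8)
# >>> '%.3f' % e.accuracy
# '0.667'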

# evaluate Precision, Recall, F-measure and Accuracy with lists or numpy arrays
class Evaluation(SimpleEvaluation):
"""
Evaluation class for measuring Precision, Recall and F-measure based on
numpy arrays or lists with true/false positive/negative detections.
Parameters
----------
tp : list or numpy array
True positive detections.
fp : list or numpy array
False positive detections.
tn : list or numpy array
True negative detections.
fn : list or numpy array
False negative detections.
name : str
Name to be displayed.
"""
def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
# set default values
if tp is None:
tp = []
if fp is None:
fp = []
if tn is None:
tn = []
if fn is None:
fn = []
# instantiate a SimpleEvaluation object
super(Evaluation, self).__init__(**kwargs)
# convert everything to numpy arrays and save them
        self.tp = np.asarray(list(tp), dtype=float)
        self.fp = np.asarray(list(fp), dtype=float)
        self.tn = np.asarray(list(tn), dtype=float)
        self.fn = np.asarray(list(fn), dtype=float)
@property
def num_tp(self):
"""Number of true positive detections."""
return len(self.tp)
@property
def num_fp(self):
"""Number of false positive detections."""
return len(self.fp)
@property
def num_tn(self):
"""Number of true negative detections."""
return len(self.tn)
@property
def num_fn(self):
"""Number of false negative detections."""
return len(self.fn)
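
# Usage sketch (illustrative event times in seconds): all counts are derived
# from the lengths of the given arrays.
# >>> e = Evaluation(tp=[0.25, 1.0], fp=[2.5], fn=[3.0], name='file_1')
# >>> e.num_tp, e.num_fp, e.num_fn, e.num_annotations
# (2, 1, 1, 3)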

# class for evaluation of Precision, Recall, F-measure with 2D arrays
class MultiClassEvaluation(Evaluation):
"""
Evaluation class for measuring Precision, Recall and F-measure based on
2D numpy arrays with true/false positive/negative detections.
Parameters
----------
tp : list of tuples or numpy array, shape (num_tp, 2)
True positive detections.
fp : list of tuples or numpy array, shape (num_fp, 2)
False positive detections.
tn : list of tuples or numpy array, shape (num_tn, 2)
True negative detections.
fn : list of tuples or numpy array, shape (num_fn, 2)
False negative detections.
name : str
Name to be displayed.
Notes
-----
    The second item of the tuples (or the second column of the arrays) denotes
    the class the detection belongs to.
"""
def __init__(self, tp=None, fp=None, tn=None, fn=None, **kwargs):
# set default values
if tp is None:
tp = np.zeros((0, 2))
if fp is None:
fp = np.zeros((0, 2))
if tn is None:
tn = np.zeros((0, 2))
if fn is None:
fn = np.zeros((0, 2))
super(MultiClassEvaluation, self).__init__(**kwargs)
        self.tp = np.asarray(tp, dtype=float)
        self.fp = np.asarray(fp, dtype=float)
        self.tn = np.asarray(tn, dtype=float)
        self.fn = np.asarray(fn, dtype=float)

    def tostring(self, verbose=False, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Parameters
----------
verbose : bool
Add evaluation for individual classes.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
ret = ''
if verbose:
# extract all classes
classes = []
if self.tp.any():
classes = np.append(classes, np.unique(self.tp[:, 1]))
if self.fp.any():
classes = np.append(classes, np.unique(self.fp[:, 1]))
if self.tn.any():
classes = np.append(classes, np.unique(self.tn[:, 1]))
if self.fn.any():
classes = np.append(classes, np.unique(self.fn[:, 1]))
for cls in sorted(np.unique(classes)):
# extract the TP, FP, TN and FN of this class
tp = self.tp[self.tp[:, 1] == cls]
fp = self.fp[self.fp[:, 1] == cls]
tn = self.tn[self.tn[:, 1] == cls]
fn = self.fn[self.fn[:, 1] == cls]
# evaluate them
e = Evaluation(tp, fp, tn, fn, name='Class %s' % cls)
# append to the output string
ret += ' %s\n' % e.tostring(verbose=False)
# normal formatting
        # normal formatting
        ret += 'Annotations: %5d TP: %5d FP: %5d FN: %5d ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
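
# Usage sketch (hypothetical events; the second tuple item is the class):
# >>> me = MultiClassEvaluation(tp=[(0.25, 1), (1.0, 2)], fn=[(2.0, 1)])
# >>> me.num_tp, me.num_annotations
# (2, 3)
# Calling me.tostring(verbose=True) additionally reports one line per class.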

# class for summing Evaluations
class SumEvaluation(SimpleEvaluation):
"""
Simple class for summing evaluations.
Parameters
----------
eval_objects : list
Evaluation objects.
name : str
Name to be displayed.
"""
def __init__(self, eval_objects, name=None):
        # pylint: disable=super-init-not-called
        # Note: we want to inherit the evaluation functions/properties, so we
        #       deliberately do not call super().__init__(), but we need to
        #       take care of 'name' ourselves
if not isinstance(eval_objects, list):
# wrap the given eval_object in a list
eval_objects = [eval_objects]
self.eval_objects = eval_objects
self.name = name or 'sum for %d files' % len(self)
def __len__(self):
# just use the length of the evaluation objects
return len(self.eval_objects)
# redefine the counters (number of TP, FP, TN, FN & number of annotations)
@property
def num_tp(self):
"""Number of true positive detections."""
return sum(e.num_tp for e in self.eval_objects)
@property
def num_fp(self):
"""Number of false positive detections."""
return sum(e.num_fp for e in self.eval_objects)
@property
def num_tn(self):
"""Number of true negative detections."""
return sum(e.num_tn for e in self.eval_objects)
@property
def num_fn(self):
"""Number of false negative detections."""
return sum(e.num_fn for e in self.eval_objects)
@property
def num_annotations(self):
"""Number of annotations."""
return sum(e.num_annotations for e in self.eval_objects)
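
# Usage sketch (illustrative per-file evaluations): the counters are summed
# before computing the metrics, i.e. a micro-average over all files.
# >>> e1 = SimpleEvaluation(num_tp=3, num_fp=1, num_fn=1)
# >>> e2 = SimpleEvaluation(num_tp=1, num_fn=3)
# >>> s = SumEvaluation([e1, e2])
# >>> s.num_tp, s.num_annotations, s.precision
# (4, 8, 0.8)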

# class for averaging Evaluations
class MeanEvaluation(SumEvaluation):
"""
    Simple class for averaging evaluations.
Parameters
----------
eval_objects : list
Evaluation objects.
name : str
Name to be displayed.
"""
def __init__(self, eval_objects, name=None, **kwargs):
super(MeanEvaluation, self).__init__(eval_objects, **kwargs)
# handle the 'name' here to be able to set a different default value
self.name = name or 'mean for %d files' % len(self)
# overwrite the properties to calculate the mean instead of the sum
@property
def num_tp(self):
"""Number of true positive detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_tp for e in self.eval_objects])
@property
def num_fp(self):
"""Number of false positive detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_fp for e in self.eval_objects])
@property
def num_tn(self):
"""Number of true negative detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_tn for e in self.eval_objects])
@property
def num_fn(self):
"""Number of false negative detections."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_fn for e in self.eval_objects])
@property
def num_annotations(self):
"""Number of annotations."""
if not self.eval_objects:
return 0.
return np.nanmean([e.num_annotations for e in self.eval_objects])
@property
def precision(self):
"""Precision."""
return np.nanmean([e.precision for e in self.eval_objects])
@property
def recall(self):
"""Recall."""
return np.nanmean([e.recall for e in self.eval_objects])
@property
def fmeasure(self):
"""F-measure."""
return np.nanmean([e.fmeasure for e in self.eval_objects])
@property
def accuracy(self):
"""Accuracy."""
return np.nanmean([e.accuracy for e in self.eval_objects])

    def tostring(self, **kwargs):
"""
Format the evaluation metrics as a human readable string.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
        ret = ''
        if self.name is not None:
            ret += '%s\n ' % self.name
        # TODO: unify this with SimpleEvaluation.tostring() and add an option
        #       to provide field formatters (e.g. '3d' or '5.2f')
        # format with floats instead of integers
        ret += 'Annotations: %5.2f TP: %5.2f FP: %5.2f FN: %5.2f ' \
               'Precision: %.3f Recall: %.3f F-measure: %.3f Acc: %.3f' % \
               (self.num_annotations, self.num_tp, self.num_fp, self.num_fn,
                self.precision, self.recall, self.fmeasure, self.accuracy)
        return ret
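
# Usage sketch (same illustrative evaluations as above): here the metrics are
# averaged over the files instead, i.e. a macro-average.
# >>> m = MeanEvaluation([e1, e2])
# >>> float(m.precision)  # mean of 3/4 and 1/1
# 0.875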

def tostring(eval_objects, **kwargs):
"""
Format the given evaluation objects as human readable strings.
Parameters
----------
eval_objects : list
Evaluation objects.
Returns
-------
str
Evaluation metrics formatted as a human readable string.
"""
# pylint: disable=unused-argument
return '\n'.join([e.tostring() for e in eval_objects])

def tocsv(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
"""
Format the given evaluation objects as a CSV table.
Parameters
----------
eval_objects : list
Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric and the metric label, e.g. ('fp', 'False Positives').
float_format : str, optional
How to format the metrics.
Returns
-------
str
CSV table representation of the evaluation objects.
Notes
-----
If no `metric_names` are given, they will be extracted from the first
evaluation object.
"""
# pylint: disable=unused-argument
if metric_names is None:
# get the evaluation metrics from the first evaluation object
metric_names = eval_objects[0].METRIC_NAMES
metric_names, metric_labels = list(zip(*metric_names))
# add header
lines = ['Name,' + ','.join(metric_labels)]
# TODO: use e.metrics dict?
# add the evaluation objects
for e in eval_objects:
values = [float_format.format(getattr(e, mn)) for mn in metric_names]
lines.append(e.name + ',' + ','.join(values))
# return everything
return '\n'.join(lines)
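
# Usage sketch (illustrative object; the evaluation objects must have a name):
# >>> e = SimpleEvaluation(num_tp=3, num_fp=1, num_fn=1, name='demo')
# >>> print(tocsv([e], metric_names=[('precision', 'Precision'),
# ...                                ('recall', 'Recall')]))
# Name,Precision,Recall
# demo,0.750,0.750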

def totex(eval_objects, metric_names=None, float_format='{:.3f}', **kwargs):
"""
Format the given evaluation objects as a LaTeX table.
Parameters
----------
eval_objects : list
Evaluation objects.
    metric_names : list of tuples, optional
        List of tuples defining the name of the property corresponding to the
        metric and the metric label, e.g. ('fp', 'False Positives').
float_format : str, optional
How to format the metrics.
Returns
-------
str
LaTeX table representation of the evaluation objects.
Notes
-----
If no `metric_names` are given, they will be extracted from the first
evaluation object.
"""
# pylint: disable=unused-argument
if metric_names is None:
# get the evaluation metrics from the first evaluation object
metric_names = eval_objects[0].METRIC_NAMES
metric_names, metric_labels = list(zip(*metric_names))
# add header
lines = ['Name & ' + ' & '.join(metric_labels) + '\\\\']
# TODO: use e.metrics dict
# TODO: add a generic totable() function which accepts columns separator,
# newline stuff (e.g. tex \\\\) and others
# add the evaluation objects
for e in eval_objects:
values = [float_format.format(getattr(e, mn)) for mn in metric_names]
lines.append(e.name + ' & ' + ' & '.join(values) + '\\\\')
# return everything
return '\n'.join(lines)
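
# The LaTeX variant of the sketch above renders '&'-separated columns:
# >>> print(totex([e], metric_names=[('precision', 'Precision')]))
# Name & Precision\\
# demo & 0.750\\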

def evaluation_io(parser, ann_suffix, det_suffix, ann_dir=None, det_dir=None):
"""
Add evaluation input/output and formatting related arguments to an existing
parser object.
Parameters
----------
parser : argparse parser instance
Existing argparse parser object.
ann_suffix : str
Suffix of the annotation files.
det_suffix : str
Suffix of the detection files.
ann_dir : str, optional
Use only annotations from this folder (and sub-folders).
det_dir : str, optional
Use only detections from this folder (and sub-folders).
Returns
-------
io_group : argparse argument group
Evaluation input / output argument group.
formatter_group : argparse argument group
Evaluation formatter argument group.
"""
import sys
import argparse
# general input output file handling
parser.add_argument('files', nargs='*',
help='files (or folders) to be evaluated')
parser.add_argument('-o', dest='outfile', type=argparse.FileType('w'),
default=sys.stdout,
help='output file [default: STDOUT]')
# file suffixes used for evaluation
g = parser.add_argument_group('file/folder/suffix arguments')
g.add_argument('-a', dest='ann_suffix', action='store', default=ann_suffix,
help='suffix of the annotation files '
'[default: %(default)s]')
g.add_argument('--ann_dir', action='store', default=ann_dir,
help='search only this directory (recursively) for '
'annotation files [default: %(default)s]')
g.add_argument('-d', dest='det_suffix', action='store', default=det_suffix,
help='suffix of the detection files [default: %(default)s]')
g.add_argument('--det_dir', action='store', default=det_dir,
help='search only this directory (recursively) for '
'detection files [default: %(default)s]')
# option to ignore non-existing detections
g.add_argument('-i', '--ignore_non_existing', action='store_true',
help='ignore non-existing detections [default: raise a '
'warning and assume empty detections]')
# verbose
parser.add_argument('-v', '--verbose', action='count', default=0,
help='increase verbosity level')
# option to suppress warnings
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress any warnings')
# output format options
parser.set_defaults(output_formatter=tostring)
f = parser.add_argument_group('formatting arguments')
formats = f.add_mutually_exclusive_group()
formats.add_argument('--tex', dest='output_formatter',
action='store_const', const=totex,
help='format output to be used in .tex files')
formats.add_argument('--csv', dest='output_formatter',
action='store_const', const=tocsv,
help='format output to be used in .csv files')
    # return both argument groups so the caller can add more options
    return g, f
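
# Usage sketch (hypothetical suffixes, for illustration only):
# >>> import argparse
# >>> p = argparse.ArgumentParser()
# >>> io_group, formatter_group = evaluation_io(p, '.beats', '.beats.txt')
# >>> args = p.parse_args(['data_folder', '--csv'])
# >>> args.output_formatter is tocsv
# True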

# finally import the submodules
from . import beats, chords, key, notes, onsets, tempo
# import often used classes
from .beats import BeatEvaluation, BeatMeanEvaluation
from .chords import ChordEvaluation, ChordMeanEvaluation, ChordSumEvaluation
from .key import KeyEvaluation, KeyMeanEvaluation
from .notes import NoteEvaluation, NoteMeanEvaluation, NoteSumEvaluation
from .onsets import OnsetEvaluation, OnsetMeanEvaluation, OnsetSumEvaluation
from .tempo import TempoEvaluation, TempoMeanEvaluation