Source code for pumpp.task.chord

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''Chord recognition task transformer'''

import re
from itertools import product

import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import mir_eval
import jams

from .base import BaseTaskTransformer
from ..exceptions import ParameterError

__all__ = ['ChordTransformer', 'SimpleChordTransformer', 'ChordTagTransformer']


def _pad_nochord(target, axis=-1):
    '''Pad a chord annotation with no-chord flags.

    Parameters
    ----------
    target : np.ndarray
        the input data

    axis : int
        the axis along which to pad

    Returns
    -------
    target_pad : np.ndarray
        `target` expanded by 1 along the specified `axis`.
        The padded position is 1 (no-chord) for frames where `target`
        is all-zero, and 0 otherwise.
    '''
    ncmask = ~np.max(target, axis=axis, keepdims=True)

    return np.concatenate([target, ncmask], axis=axis)
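
For illustration, here is a minimal sketch of `_pad_nochord` applied to a
two-frame boolean activation matrix (using the function defined above):

import numpy as np

target = np.array([[True, False, False],
                   [False, False, False]])

# The appended column is True exactly where the frame has no active class
_pad_nochord(target)
# array([[ True, False, False, False],
#        [False, False, False,  True]])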


class ChordTransformer(BaseTaskTransformer):
    '''Chord annotation transformers.

    This transformer uses a (pitch, root, bass) decomposition of
    chord annotations.

    Attributes
    ----------
    name : str
        The name of the chord transformer

    sr : number > 0
        The sampling rate of audio

    hop_length : int > 0
        The number of samples between each annotation frame

    sparse : bool
        If True, root and bass values are sparsely encoded as integers
        in [0, 12].  If False, root and bass values are densely encoded
        as 13-dimensional booleans.

    See Also
    --------
    SimpleChordTransformer
    '''
    def __init__(self, name='chord', sr=22050, hop_length=512, sparse=False):
        '''Initialize a chord task transformer'''

        super(ChordTransformer, self).__init__(name=name,
                                               namespace='chord',
                                               sr=sr,
                                               hop_length=hop_length)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([list(range(12))])
        self._classes = set(self.encoder.classes_)
        self.sparse = sparse

        self.register('pitch', [None, 12], bool)
        if self.sparse:
            self.register('root', [None, 1], int)
            self.register('bass', [None, 1], int)
        else:
            self.register('root', [None, 13], bool)
            self.register('bass', [None, 13], bool)
    def empty(self, duration):
        '''Empty chord annotations

        Parameters
        ----------
        duration : number
            The length (in seconds) of the empty annotation

        Returns
        -------
        ann : jams.Annotation
            A chord annotation consisting of a single `no-chord` observation.
        '''
        ann = super(ChordTransformer, self).empty(duration)

        ann.append(time=0,
                   duration=duration,
                   value='N', confidence=0)

        return ann

    def transform_annotation(self, ann, duration):
        '''Apply the chord transformation.

        Parameters
        ----------
        ann : jams.Annotation
            The chord annotation

        duration : number > 0
            The target duration

        Returns
        -------
        data : dict
            data['pitch'] : np.ndarray, shape=(n, 12)
            data['root'] : np.ndarray, shape=(n, 13) or (n, 1)
            data['bass'] : np.ndarray, shape=(n, 13) or (n, 1)

            `pitch` is a binary matrix indicating pitch class
            activation at each frame.

            `root` is a one-hot matrix indicating the chord
            root's pitch class at each frame.

            `bass` is a one-hot matrix indicating the chord
            bass (lowest note) pitch class at each frame.

            If sparsely encoded, `root` and `bass` are integers
            in the range [0, 12] where 12 indicates no chord.

            If densely encoded, `root` and `bass` have an extra
            final dimension which is active when there is no chord
            sounding.
        '''
        # Extract the labeled intervals and chord labels
        intervals, chords = ann.data.to_interval_values()

        # Get the dtype for root/bass
        if self.sparse:
            dtype = int
        else:
            dtype = bool

        # If we don't have any labeled intervals, fill in a no-chord
        if not chords:
            intervals = np.asarray([[0, duration]])
            chords = ['N']

        # Suppress all intervals not in the encoder
        pitches = []
        roots = []
        basses = []

        # default value when data is missing
        if self.sparse:
            fill = 12
        else:
            fill = False

        for chord in chords:
            # Encode the pitches
            root, semi, bass = mir_eval.chord.encode(chord)
            pitches.append(np.roll(semi, root))

            if self.sparse:
                if root in self._classes:
                    roots.append([root])
                    basses.append([(root + bass) % 12])
                else:
                    roots.append([fill])
                    basses.append([fill])
            else:
                if root in self._classes:
                    roots.extend(self.encoder.transform([[root]]))
                    basses.extend(self.encoder.transform([[(root + bass) % 12]]))
                else:
                    roots.extend(self.encoder.transform([[]]))
                    basses.extend(self.encoder.transform([[]]))

        pitches = np.asarray(pitches, dtype=bool)
        roots = np.asarray(roots, dtype=dtype)
        basses = np.asarray(basses, dtype=dtype)

        target_pitch = self.encode_intervals(duration, intervals, pitches)

        target_root = self.encode_intervals(duration, intervals, roots,
                                            multi=False,
                                            dtype=dtype,
                                            fill=fill)
        target_bass = self.encode_intervals(duration, intervals, basses,
                                            multi=False,
                                            dtype=dtype,
                                            fill=fill)

        if not self.sparse:
            target_root = _pad_nochord(target_root)
            target_bass = _pad_nochord(target_bass)

        return {'pitch': target_pitch,
                'root': target_root,
                'bass': target_bass}

    def inverse(self, pitch, root, bass, duration=None):
        raise NotImplementedError('Chord cannot be inverted')
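
As a usage sketch (not part of the original module), the per-chord
(pitch, root, bass) decomposition computed inside `transform_annotation`
can be reproduced directly with `mir_eval`:

import numpy as np
import mir_eval

# 'E:min/5' is E minor with the fifth (B) in the bass
root, semi, bass = mir_eval.chord.encode('E:min/5')

# semi marks intervals relative to the root; rolling by the root
# yields absolute pitch classes, as in transform_annotation above
pitch_classes = np.roll(semi, root)   # active classes: E, G, B
bass_pc = (root + bass) % 12          # absolute bass pitch class: B (11)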
class SimpleChordTransformer(ChordTransformer):
    '''Simplified chord transformations.  Only pitch class activity is encoded.

    Attributes
    ----------
    name : str
        name of the transformer

    sr : number > 0
        Sampling rate of audio

    hop_length : int > 0
        Hop length for annotation frames

    See Also
    --------
    ChordTransformer
    '''
    def __init__(self, name='chord', sr=22050, hop_length=512):
        super(SimpleChordTransformer, self).__init__(name=name,
                                                     sr=sr,
                                                     hop_length=hop_length)
        # Remove the extraneous fields
        self.pop('root')
        self.pop('bass')
    def transform_annotation(self, ann, duration):
        '''Apply the chord transformation.

        Parameters
        ----------
        ann : jams.Annotation
            The chord annotation

        duration : number > 0
            The target duration

        Returns
        -------
        data : dict
            data['pitch'] : np.ndarray, shape=(n, 12)

            `pitch` is a binary matrix indicating pitch class
            activation at each frame.
        '''
        data = super(SimpleChordTransformer,
                     self).transform_annotation(ann, duration)

        data.pop('root', None)
        data.pop('bass', None)
        return data

    def inverse(self, *args, **kwargs):
        raise NotImplementedError('SimpleChord cannot be inverted')
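
A brief usage sketch (hypothetical, assuming the surrounding pumpp
machinery and its scoped field naming):

tx = SimpleChordTransformer(name='chord')

# root and bass were popped in __init__, so only the scoped
# pitch field should remain registered
sorted(tx.fields)    # expected: ['chord/pitch']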
'''A list of normalized pitch class names'''
PITCHES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']


'''A mapping of chord quality encodings to their names'''
QUALITIES = {
    0b000100000000: 'min',
    0b000010000000: 'maj',
    0b000100010000: 'min',
    0b000010010000: 'maj',
    0b000100100000: 'dim',
    0b000010001000: 'aug',
    0b000100010010: 'min7',
    0b000010010001: 'maj7',
    0b000010010010: '7',
    0b000100100100: 'dim7',
    0b000100100010: 'hdim7',
    0b000100010001: 'minmaj7',
    0b000100010100: 'min6',
    0b000010010100: 'maj6',
    0b001000010000: 'sus2',
    0b000001010000: 'sus4',
}
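
For illustration, the bit patterns above can be matched against
`mir_eval`'s interval encoding; this sketch mirrors the query
construction used by `ChordTagTransformer.simplify` below:

import numpy as np
import mir_eval

_, semi, _ = mir_eval.chord.encode('C:maj7')

# Dot against powers of two to pack the 12-d interval vector into an
# integer; bit 11 corresponds to the root and is masked off below
P = 2 ** np.arange(12, dtype=int)
bits = int(semi[::-1].dot(P))         # 0b100010010001

# 0b001111111111 equals ChordTagTransformer.mask_ for the full vocab '3567s'
QUALITIES[bits & 0b001111111111]      # -> 'maj7'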
class ChordTagTransformer(BaseTaskTransformer):
    '''Chord transformer that uses a tag-space encoding for chord labels.

    Attributes
    ----------
    name : str
        name of the transformer

    vocab : str
        A string of chord quality indicators to include:

            - '3': maj/min
            - '5': '3' + aug/dim
            - '6': '3' + '5' + maj6/min6
            - '7': '3' + '5' + '6' + 7/min7/maj7/dim7/hdim7/minmaj7
            - 's': sus2/sus4

        Note: '5' requires '3', '6' requires '5', and '7' requires '6'.

    sr : number > 0
        Sampling rate of audio

    hop_length : int > 0
        Hop length for annotation frames

    sparse : bool
        If True, chords are encoded as integer indices.
        If False, chords are one-hot encoded.

    See Also
    --------
    ChordTransformer
    SimpleChordTransformer
    '''
    def __init__(self, name='chord', vocab='3567s', sr=22050, hop_length=512,
                 sparse=False):
        super(ChordTagTransformer, self).__init__(name=name,
                                                  namespace='chord',
                                                  sr=sr,
                                                  hop_length=hop_length)

        # Validate the vocabulary string
        if set(vocab) - set('3567s'):
            raise ParameterError('Invalid vocabulary string: {}'.format(vocab))

        if '5' in vocab and '3' not in vocab:
            raise ParameterError('Invalid vocabulary string: {}'.format(vocab))

        if '6' in vocab and '5' not in vocab:
            raise ParameterError('Invalid vocabulary string: {}'.format(vocab))

        if '7' in vocab and '6' not in vocab:
            raise ParameterError('Invalid vocabulary string: {}'.format(vocab))

        self.vocab = vocab.lower()
        labels = self.vocabulary()
        self.sparse = sparse

        if self.sparse:
            self.encoder = LabelEncoder()
        else:
            self.encoder = LabelBinarizer()
        self.encoder.fit(labels)
        self._classes = set(self.encoder.classes_)

        # Construct the quality mask for chord encoding
        self.mask_ = 0b000000000000
        if '3' in self.vocab:
            self.mask_ |= 0b000110000000
        if '5' in self.vocab:
            self.mask_ |= 0b000110111000
        if '6' in self.vocab:
            self.mask_ |= 0b000110010100
        if '7' in self.vocab:
            self.mask_ |= 0b000110110111
        if 's' in self.vocab:
            self.mask_ |= 0b001001010000

        if self.sparse:
            self.register('chord', [None, 1], int)
        else:
            self.register('chord', [None, len(self._classes)], bool)
    def empty(self, duration):
        '''Empty chord annotations

        Parameters
        ----------
        duration : number
            The length (in seconds) of the empty annotation

        Returns
        -------
        ann : jams.Annotation
            A chord annotation consisting of a single `no-chord` observation.
        '''
        ann = super(ChordTagTransformer, self).empty(duration)

        ann.append(time=0,
                   duration=duration,
                   value='X', confidence=0)

        return ann

    def vocabulary(self):
        qualities = []

        if '3' in self.vocab or '5' in self.vocab:
            qualities.extend(['min', 'maj'])

        if '5' in self.vocab:
            qualities.extend(['dim', 'aug'])

        if '6' in self.vocab:
            qualities.extend(['min6', 'maj6'])

        if '7' in self.vocab:
            qualities.extend(['min7', 'maj7', '7', 'dim7',
                              'hdim7', 'minmaj7'])

        if 's' in self.vocab:
            qualities.extend(['sus2', 'sus4'])

        labels = ['N', 'X']

        for chord in product(PITCHES, qualities):
            labels.append('{}:{}'.format(*chord))

        return labels

    def simplify(self, chord):
        '''Simplify a chord string down to the vocabulary space'''
        # Drop inversions
        chord = re.sub(r'/.*$', r'', chord)
        # Drop any additional or suppressed tones
        chord = re.sub(r'\(.*?\)', r'', chord)
        # Drop dangling : indicators
        chord = re.sub(r':$', r'', chord)

        # Encode the chord
        root, pitches, _ = mir_eval.chord.encode(chord)

        # Build the query
        # To map the binary vector pitches down to bit masked integer,
        # we just dot against powers of 2
        P = 2**np.arange(12, dtype=int)
        query = self.mask_ & pitches[::-1].dot(P)

        if root < 0 and chord[0].upper() == 'N':
            return 'N'
        if query not in QUALITIES:
            return 'X'

        return '{}:{}'.format(PITCHES[root], QUALITIES[query])

    def transform_annotation(self, ann, duration):
        '''Transform an annotation to chord-tag encoding

        Parameters
        ----------
        ann : jams.Annotation
            The annotation to convert

        duration : number > 0
            The duration of the track

        Returns
        -------
        data : dict
            data['chord'] : np.ndarray, shape=(n, n_labels)
                A time-varying binary encoding of the chords
        '''
        intervals, values = ann.data.to_interval_values()

        chords = []
        for v in values:
            chords.extend(self.encoder.transform([self.simplify(v)]))

        dtype = self.fields[self.scope('chord')].dtype

        chords = np.asarray(chords)

        if self.sparse:
            chords = chords[:, np.newaxis]

        target = self.encode_intervals(duration, intervals, chords,
                                       multi=False, dtype=dtype)

        return {'chord': target}

    def inverse(self, encoded, duration=None):
        '''Inverse transformation'''

        ann = jams.Annotation(self.namespace, duration=duration)

        for start, end, value in self.decode_intervals(encoded,
                                                       duration=duration,
                                                       multi=False,
                                                       sparse=self.sparse):
            if self.sparse:
                value_dec = self.encoder.inverse_transform(value)
            else:
                value_dec = self.encoder.inverse_transform(np.atleast_2d(value))

            for vd in value_dec:
                ann.append(time=start, duration=end-start, value=vd)

        return ann
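
A minimal usage sketch (assuming pumpp and its dependencies are
installed); `simplify` collapses out-of-vocabulary details before
encoding:

tx = ChordTagTransformer(vocab='3567s')

tx.simplify('C:maj7/3')    # inversion dropped        -> 'C:maj7'
tx.simplify('G:sus4(9)')   # added tone dropped       -> 'G:sus4'
tx.simplify('N')           # no-chord passes through  -> 'N'

# 'X' covers anything outside the vocabulary, and the label set is
# given by tx.vocabulary(): ['N', 'X', 'C:min', 'C:maj', ...]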