# -*- coding: utf-8 -*-
'''Pitch-tracking and tuning estimation'''
from __future__ import division
import numpy as np
import scipy.fftpack as fft
from warnings import warn
from . import audio
from .time_frequency import cqt_frequencies, note_to_hz
from .spectrum import stft
from .pitch import estimate_tuning
from .. import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
__all__ = ['cqt', 'hybrid_cqt', 'pseudo_cqt']
@cache
[docs]def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
bins_per_octave=12, tuning=None, filter_scale=1,
aggregate=None, norm=1, sparsity=0.01, real=True,
resolution=util.Deprecated()):
'''Compute the constant-Q transform of an audio signal.
This implementation is based on the recursive sub-sampling method
described by [1]_.
.. [1] Schoerkhuber, Christian, and Anssi Klapuri.
"Constant-Q transform toolbox for music processing."
7th Sound and Music Computing Conference, Barcelona, Spain. 2010.
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of `y`
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to C1 ~= 32.70 Hz
n_bins : int > 0 [scalar]
Number of frequency bins, starting at `fmin`
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float in `[-0.5, 0.5)`
Tuning offset in fractions of a bin (cents).
If `None`, tuning will be automatically estimated.
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
aggregate : None or function
Aggregation function for time-oversampling energy aggregation.
By default, `np.mean`. See `librosa.util.sync`.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to `sparsity`
fraction of the energy in each basis.
Set `sparsity=0` to disable sparsification.
real : bool
If true, return only the magnitude of the CQT.
resolution : float
.. warning:: This parameter name was in librosa 0.4.2
Use the `filter_scale` parameter instead.
The `resolution` parameter will be removed in librosa 0.5.0.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float]
Constant-Q value each frequency at each time.
Raises
------
ParameterError
If `hop_length` is not an integer multiple of
`2**(n_bins / bins_per_octave)`
See Also
--------
librosa.core.resample
librosa.util.sync
librosa.util.normalize
Examples
--------
Generate and plot a constant-Q power spectrum
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.util.example_audio_file())
>>> C = librosa.cqt(y, sr=sr)
>>> librosa.display.specshow(librosa.logamplitude(C**2, ref_power=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note')
>>> plt.colorbar(format='%+2.0f dB')
>>> plt.title('Constant-Q power spectrum')
>>> plt.tight_layout()
Limit the frequency range
>>> C = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60)
>>> C
array([[ 8.827e-04, 9.293e-04, ..., 3.133e-07, 2.942e-07],
[ 1.076e-03, 1.068e-03, ..., 1.153e-06, 1.148e-06],
...,
[ 1.042e-07, 4.087e-07, ..., 1.612e-07, 1.928e-07],
[ 2.363e-07, 5.329e-07, ..., 1.294e-07, 1.611e-07]])
Using a higher frequency resolution
>>> C = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60 * 2, bins_per_octave=12 * 2)
>>> C
array([[ 1.536e-05, 5.848e-05, ..., 3.241e-07, 2.453e-07],
[ 1.856e-03, 1.854e-03, ..., 2.397e-08, 3.549e-08],
...,
[ 2.034e-07, 4.245e-07, ..., 6.213e-08, 1.463e-07],
[ 4.896e-08, 5.407e-07, ..., 9.176e-08, 1.051e-07]])
'''
filter_scale = util.rename_kw('resolution', resolution,
'filter_scale', filter_scale,
'0.4.2', '0.5.0')
if real:
warn('Real-valued CQT (real=True) is deprecated in 0.4.2. '
'Complex-valued CQT will become the default in 0.5.0. '
'Consider using np.abs(librosa.cqt(..., real=False)) '
'instead of real=True to maintain forward compatibility.',
DeprecationWarning)
# How many octaves are we dealing with?
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
if fmin is None:
# C1 by default
fmin = note_to_hz('C1')
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr)
# First thing, get the freqs of the top octave
freqs = cqt_frequencies(n_bins, fmin,
bins_per_octave=bins_per_octave)[-bins_per_octave:]
fmin_t = np.min(freqs)
fmax_t = np.max(freqs)
# Determine required resampling quality
Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1)
filter_cutoff = fmax_t * (1 + filters.window_bandwidth('hann') / Q)
nyquist = sr / 2.0
if filter_cutoff < audio.BW_FASTEST * nyquist:
res_type = 'sinc_fastest'
elif filter_cutoff < audio.BW_MEDIUM * nyquist:
res_type = 'sinc_medium'
elif filter_cutoff < audio.BW_BEST * nyquist:
res_type = 'sinc_best'
else:
res_type = 'sinc_best'
cqt_resp = []
y, sr, hop_length = __early_downsample(y, sr, hop_length,
res_type, n_octaves,
nyquist, filter_cutoff)
n_filters = min(bins_per_octave, n_bins)
if res_type != 'sinc_fastest' and audio._HAS_SAMPLERATE:
# Do two octaves before resampling to allow for usage of sinc_fastest
fft_basis, n_fft, filter_lengths = __fft_filters(sr, fmin_t,
n_filters,
bins_per_octave,
tuning,
filter_scale,
norm,
sparsity)
min_filter_length = np.min(filter_lengths)
# Compute a dynamic hop based on n_fft
my_cqt = __variable_hop_response(y, n_fft,
hop_length,
min_filter_length,
fft_basis,
aggregate)
# Convolve
cqt_resp.append(my_cqt)
fmin_t /= 2
fmax_t /= 2
n_octaves -= 1
filter_cutoff = fmax_t * (1 + filters.window_bandwidth('hann') / Q)
assert filter_cutoff < audio.BW_FASTEST*nyquist
res_type = 'sinc_fastest'
# Make sure our hop is long enough to support the bottom octave
num_twos = __num_two_factors(hop_length)
if num_twos < n_octaves - 1:
raise ParameterError('hop_length must be a positive integer '
'multiple of 2^{0:d} for {1:d}-octave CQT'
.format(n_octaves - 1, n_octaves))
# Now do the recursive bit
fft_basis, n_fft, filter_lengths = __fft_filters(sr, fmin_t,
n_filters,
bins_per_octave,
tuning,
filter_scale,
norm,
sparsity)
min_filter_length = np.min(filter_lengths)
my_y, my_sr, my_hop = y, sr, hop_length
# Iterate down the octaves
for i in range(n_octaves):
# Resample (except first time)
if i > 0:
# The additional scaling of sqrt(2) here is to implicitly rescale the filters
my_y = np.sqrt(2) * audio.resample(my_y, my_sr, my_sr/2.0,
res_type=res_type,
scale=True)
my_sr /= 2.0
assert my_hop % 2 == 0
my_hop //= 2
# Compute a dynamic hop based on n_fft
my_cqt = __variable_hop_response(my_y, n_fft,
my_hop,
min_filter_length,
fft_basis,
aggregate)
# Convolve
cqt_resp.append(my_cqt)
return __trim_stack(cqt_resp, n_bins, real)
@cache
[docs]def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
bins_per_octave=12, tuning=None, filter_scale=1,
norm=1, sparsity=0.01,
resolution=util.Deprecated()):
'''Compute the hybrid constant-Q transform of an audio signal.
Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
the hop_length is longer than half the filter length and the full CQT
for lower frequencies.
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of `y`
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to C1 ~= 32.70 Hz
n_bins : int > 0 [scalar]
Number of frequency bins, starting at `fmin`
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float in `[-0.5, 0.5)`
Tuning offset in fractions of a bin (cents).
If `None`, tuning will be automatically estimated.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to `sparsity`
fraction of the energy in each basis.
Set `sparsity=0` to disable sparsification.
resolution : float
.. warning:: This parameter name was in librosa 0.4.2
Use the `filter_scale` parameter instead.
The `resolution` parameter will be removed in librosa 0.5.0.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
Constant-Q energy for each frequency at each time.
Raises
------
ParameterError
If `hop_length` is not an integer multiple of
`2**(n_bins / bins_per_octave)`
See Also
--------
cqt
pseudo_cqt
'''
filter_scale = util.rename_kw('resolution', resolution,
'filter_scale', filter_scale,
'0.4.2', '0.5.0')
if fmin is None:
# C1 by default
fmin = note_to_hz('C1')
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr)
# Get all CQT frequencies
freqs = cqt_frequencies(n_bins, fmin,
bins_per_octave=bins_per_octave,
tuning=tuning)
# Compute the length of each constant-Q basis function
lengths = filters.constant_q_lengths(sr, fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale)
# Determine which filters to use with Pseudo CQT
pseudo_filters = lengths < 2*hop_length
n_bins_pseudo = int(np.sum(pseudo_filters))
cqt_resp = []
if n_bins_pseudo > 0:
fmin_pseudo = np.min(freqs[pseudo_filters])
my_pseudo_cqt = pseudo_cqt(y, sr,
hop_length=hop_length,
fmin=fmin_pseudo,
n_bins=n_bins_pseudo,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity)
cqt_resp.append(my_pseudo_cqt)
n_bins_full = int(np.sum(~pseudo_filters))
if n_bins_full > 0:
fmin_full = np.min(freqs[~pseudo_filters])
my_cqt = np.abs(cqt(y, sr,
hop_length=hop_length,
fmin=fmin_full,
n_bins=n_bins_full,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
real=False))
cqt_resp.append(my_cqt)
return __trim_stack(cqt_resp, n_bins, True)
@cache
[docs]def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
bins_per_octave=12, tuning=None, filter_scale=1,
norm=1, sparsity=0.01,
resolution=util.Deprecated()):
'''Compute the pseudo constant-Q transform of an audio signal.
This uses a single fft size that is the smallest power of 2 that is greater
than or equal to the max of:
1. The longest CQT filter
2. 2x the hop_length
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of `y`
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to C1 ~= 32.70 Hz
n_bins : int > 0 [scalar]
Number of frequency bins, starting at `fmin`
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float in `[-0.5, 0.5)`
Tuning offset in fractions of a bin (cents).
If `None`, tuning will be automatically estimated.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to `sparsity`
fraction of the energy in each basis.
Set `sparsity=0` to disable sparsification.
resolution : float
.. warning:: This parameter name was in librosa 0.4.2
Use the `filter_scale` parameter instead.
The `resolution` parameter will be removed in librosa 0.5.0.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
Pseudo Constant-Q energy for each frequency at each time.
Raises
------
ParameterError
If `hop_length` is not an integer multiple of
`2**(n_bins / bins_per_octave)`
'''
filter_scale = util.rename_kw('resolution', resolution,
'filter_scale', filter_scale,
'0.4.2', '0.5.0')
if fmin is None:
# C1 by default
fmin = note_to_hz('C1')
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr)
fft_basis, n_fft, _ = __fft_filters(sr,
fmin,
n_bins,
bins_per_octave,
tuning,
filter_scale,
norm,
sparsity,
hop_length=hop_length)
fft_basis = np.abs(fft_basis)
# Compute the magnitude STFT with Hann window
D = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length))
# Project onto the pseudo-cqt basis
return fft_basis.dot(D)
def __fft_filters(sr, fmin, n_bins, bins_per_octave, tuning,
filter_scale, norm, sparsity, hop_length=None):
'''Generate the frequency domain constant-Q filter basis.'''
basis, lengths = filters.constant_q(sr,
fmin=fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
pad_fft=True)
# Filters are padded up to the nearest integral power of 2
n_fft = basis.shape[1]
if hop_length is not None and n_fft < 2 * hop_length:
n_fft = int(2.0 ** (np.ceil(np.log2(2 * hop_length))))
# normalize by inverse length to compensate for phase invariance
basis *= lengths.reshape((-1, 1)) / n_fft
# FFT and retain only the non-negative frequencies
fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, :(n_fft // 2)+1]
# normalize as in Parseval's relation, and sparsify the basis
fft_basis = util.sparsify_rows(fft_basis / n_fft, quantile=sparsity)
return fft_basis, n_fft, lengths
def __trim_stack(cqt_resp, n_bins, real):
'''Helper function to trim and stack a collection of CQT responses'''
# cleanup any framing errors at the boundaries
max_col = min([x.shape[1] for x in cqt_resp])
cqt_resp = np.vstack([x[:, :max_col] for x in cqt_resp][::-1])
# Finally, clip out any bottom frequencies that we don't really want
# Transpose magic here to ensure column-contiguity
C = np.ascontiguousarray(cqt_resp[-n_bins:].T).T
if real:
C = np.abs(C)
return C
def __variable_hop_response(y, n_fft, hop_length, min_filter_length,
fft_basis, aggregate):
'''Compute the filter response with a target STFT hop.
If the hop is too large (more than half the frame length),
then over-sample at a smaller hop, and aggregate the results
to the desired resolution.
'''
# If target_hop <= n_fft / 2:
# my_hop = target_hop
# else:
# my_hop = target_hop * 2**(-k)
zoom_factor = np.ceil(np.log2(hop_length) - np.log2(min_filter_length))
zoom_factor = 2**int(np.maximum(0, 1 + zoom_factor))
# Compute the STFT matrix
D = stft(y, n_fft=n_fft, hop_length=int(hop_length / zoom_factor),
window=np.ones)
# And filter response energy
my_cqt = fft_basis.dot(D)
if zoom_factor > 1:
# We need to aggregate. Generate the boundary frames
bounds = np.arange(0, my_cqt.shape[1], zoom_factor, dtype=int)
my_cqt = util.sync(my_cqt, bounds, aggregate=aggregate)
return my_cqt
def __early_downsample(y, sr, hop_length, res_type, n_octaves,
nyquist, filter_cutoff):
'''Perform early downsampling on an audio signal, if it applies.'''
if not (res_type == 'sinc_fastest' and audio._HAS_SAMPLERATE):
return y, sr, hop_length
downsample_count1 = int(np.ceil(np.log2(audio.BW_FASTEST * nyquist /
filter_cutoff)) - 1)
num_twos = __num_two_factors(hop_length)
downsample_count2 = max(0, num_twos - n_octaves + 1)
downsample_count = min(downsample_count1, downsample_count2)
if downsample_count > 0:
downsample_factor = 2.0**(downsample_count)
assert hop_length % downsample_factor == 0
hop_length //= downsample_factor
# The additional scaling of sqrt(downsample_factor) here is to implicitly
# rescale the filters
y = np.sqrt(downsample_factor) * audio.resample(y, sr, sr / downsample_factor,
res_type=res_type, scale=True)
sr /= downsample_factor
return y, sr, hop_length
def __num_two_factors(x):
"""Return how many times integer x can be evenly divided by 2.
Returns 0 for non-positive integers.
"""
if x <= 0:
return 0
num_twos = 0
while x % 2 == 0:
num_twos += 1
x //= 2
return num_twos