Skip to content

Spectrogram

Spectrogram transforms for frequency-domain analysis.

Spectrogram

Spectrogram(n_fft: int = 400, win_length: int | None = None, hop_length: int | None = None, pad: int = 0, window_fn: Callable[..., Tensor] = torch.hann_window, power: float | None = 2.0, normalized: bool = False, wkwargs: dict | None = None)

Bases: Module

Create a spectrogram from an audio signal.

Parameters:

Name Type Description Default
n_fft int

size of FFT, creates n_fft // 2 + 1 bins.

400
win_length int | None

window size, defaults to n_fft.

None
hop_length int | None

hop between STFT windows, defaults to win_length // 2.

None
pad int

two-sided padding of signal.

0
window_fn Callable[..., Tensor]

callable that creates the window tensor (built once, then applied to every STFT frame).

hann_window
power float | None

exponent for the magnitude spectrogram (e.g. 1 for magnitude, 2 for power), or None for complex spectrum.

2.0
normalized bool

whether to normalize by window magnitude after STFT.

False
wkwargs dict | None

additional keyword arguments for the window function.

None
Source code in tsfast/spectogram.py
def __init__(
    self,
    n_fft: int = 400,
    win_length: int | None = None,
    hop_length: int | None = None,
    pad: int = 0,
    window_fn: Callable[..., Tensor] = torch.hann_window,
    power: float | None = 2.0,
    normalized: bool = False,
    wkwargs: dict | None = None,
) -> None:
    """Store the STFT settings and register the analysis window as a buffer.

    Args:
        n_fft: FFT size.
        win_length: window size; falls back to ``n_fft`` when None.
        hop_length: hop between windows; falls back to ``win_length // 2``
            when None.
        pad: two-sided padding amount, stored for use in ``forward``.
        window_fn: callable invoked once with the window length (plus
            ``wkwargs``) to build the window tensor.
        power: exponent stored for use in ``forward``; may be None.
        normalized: normalization flag stored for use in ``forward``.
        wkwargs: optional keyword arguments forwarded to ``window_fn``.
    """
    super().__init__()

    # Resolve the optional lengths first so the fallbacks chain:
    # hop_length depends on the (possibly defaulted) win_length.
    if win_length is None:
        win_length = n_fft
    if hop_length is None:
        hop_length = win_length // 2

    # Number of FFT bins. The returned STFT result will have n_fft // 2 + 1
    # frequencies due to onesided=True in torch.stft.
    self.n_fft = n_fft
    self.win_length = win_length
    self.hop_length = hop_length

    # Registered as a buffer (not a plain attribute) so it is tracked by the module.
    extra_window_args = {} if wkwargs is None else wkwargs
    self.register_buffer("window", window_fn(win_length, **extra_window_args))

    self.pad = pad
    self.power = power
    self.normalized = normalized

forward

forward(waveform: Tensor) -> Tensor

Compute the spectrogram of the input waveform.

Returns:

Type Description
Tensor

Spectrogram tensor of shape (..., freq, time), where freq is n_fft // 2 + 1 and time is the number of window hops.

Source code in tsfast/spectogram.py
def forward(self, waveform: Tensor) -> Tensor:
    """Apply the functional ``spectrogram`` to ``waveform`` with this module's settings.

    Args:
        waveform: input signal tensor, passed through unchanged.

    Returns:
        Spectrogram tensor of shape (..., freq, time), where freq is
        ``n_fft // 2 + 1`` and time is the number of window hops.
    """
    # Pure delegation: every setting captured in __init__ is forwarded by name.
    return spectrogram(
        waveform,
        pad=self.pad,
        window=self.window,
        n_fft=self.n_fft,
        hop_length=self.hop_length,
        win_length=self.win_length,
        power=self.power,
        normalized=self.normalized,
    )

complex_norm

complex_norm(complex_tensor: Tensor, power: float = 1.0) -> Tensor

Compute the norm of a complex tensor raised to a power.

Source code in tsfast/spectogram.py
def complex_norm(complex_tensor: Tensor, power: float = 1.0) -> Tensor:
    """Return the L2 norm over the last dimension, raised to ``power``.

    Args:
        complex_tensor: tensor whose last dimension is reduced; presumably the
            (real, imag) pair layout produced by ``torch.view_as_real``.
        power: exponent applied to the norm; 1.0 skips the exponentiation.

    Returns:
        Tensor with the last dimension of ``complex_tensor`` removed.
    """
    magnitude = torch.norm(complex_tensor, 2, -1)
    # Skip the pow() call entirely for the identity exponent.
    return magnitude if power == 1.0 else magnitude.pow(power)

spectrogram

spectrogram(waveform: Tensor, pad: int, window: Tensor, n_fft: int, hop_length: int, win_length: int, power: float | None, normalized: bool) -> Tensor

Compute a spectrogram from an audio/signal waveform.

Parameters:

Name Type Description Default
waveform Tensor

input signal tensor of shape (..., time).

required
pad int

two-sided zero-padding to apply.

required
window Tensor

window tensor for STFT.

required
n_fft int

FFT size, creates n_fft // 2 + 1 frequency bins.

required
hop_length int

hop between STFT windows.

required
win_length int

window size for STFT.

required
power float | None

exponent for magnitude spectrogram, or None for complex.

required
normalized bool

whether to normalize by window magnitude after STFT.

required

Returns:

Type Description
Tensor

Spectrogram tensor of shape (..., freq, time).

Source code in tsfast/spectogram.py
def spectrogram(
    waveform: Tensor,
    pad: int,
    window: Tensor,
    n_fft: int,
    hop_length: int,
    win_length: int,
    power: float | None,
    normalized: bool,
) -> Tensor:
    """Compute a spectrogram from an audio/signal waveform.

    Args:
        waveform: input signal tensor of shape (..., time).
        pad: two-sided zero-padding to apply.
        window: window tensor for STFT.
        n_fft: FFT size, creates n_fft // 2 + 1 frequency bins.
        hop_length: hop between STFT windows.
        win_length: window size for STFT.
        power: exponent for magnitude spectrogram, or None for complex.
        normalized: whether to normalize by window magnitude after STFT.

    Returns:
        Spectrogram tensor of shape (..., freq, time).
    """
    if pad > 0:
        waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant")

    # pack batch
    shape = waveform.size()
    waveform = waveform.view(-1, shape[-1])

    # default values are consistent with librosa.core.spectrum._spectrogram
    spec_f = torch.view_as_real(
        torch.stft(waveform, n_fft, hop_length, win_length, window, True, "reflect", False, True, return_complex=True)
    )

    # unpack batch
    spec_f = spec_f.view(shape[:-1] + spec_f.shape[-3:])

    if normalized:
        spec_f /= window.pow(2.0).sum().sqrt()
    if power is not None:
        spec_f = complex_norm(spec_f, power=power)

    return spec_f