Source code for speechbrain.augment.codec

"""
Codec Augmentation via torchaudio

This library provides codec augmentation techniques in torchaudio for enhanced
audio data processing.

For detailed guidance and usage examples, refer to the tutorial at:
https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html

Note: This code is compatible with FFmpeg as the torchaudio backend.
When using FFmpeg2, the maximum number of samples (i.e., waveforms in the
batch, which are passed to the codec as separate channels) that can be
processed at once is limited to 16.

Authors
 * Mirco Ravanelli 2023
"""

import random
import torch
import torchaudio



[docs]
class CodecAugment(torch.nn.Module):
    """
    Apply random audio codecs to input waveforms using torchaudio.

    This class provides an interface for applying codec augmentation techniques to audio data.
    On every forward call one `(format, encoder)` pair is drawn at random from
    `available_format_encoders` and the whole batch is encoded/decoded with it.

    Arguments
    ---------
    sample_rate: int
        The sample rate of the input waveform.
    available_format_encoders: list of tuple, optional
        Candidate `(format, encoder)` pairs to draw from. `encoder` may be
        None, in which case it is forwarded as None to
        `torchaudio.io.AudioEffector`. If None (default), the following pairs
        are used: `("wav", "pcm_mulaw")`, `("mp3", None)`, `("g722", None)`.

    Example
    -------
    >>> waveform = torch.rand(4, 16000)
    >>> if "ffmpeg" in torchaudio.list_audio_backends():
    ...     augmenter = CodecAugment(16000)
    ...     output_waveform = augmenter(waveform)
    """

    def __init__(self, sample_rate=16000, available_format_encoders=None):
        super().__init__()
        self.sample_rate = sample_rate
        if available_format_encoders is None:
            available_format_encoders = [
                ("wav", "pcm_mulaw"),
                ("mp3", None),
                ("g722", None),
            ]
        # Store a private copy so that later mutation of the caller's sequence
        # (or of another instance's list) cannot change this augmenter.
        self.available_format_encoders = list(available_format_encoders)
        if not self.available_format_encoders:
            # Fail at construction time rather than with an IndexError from
            # random.choice on the first forward call.
            raise ValueError(
                "available_format_encoders must contain at least one "
                "(format, encoder) pair."
            )

    def apply_codec(self, waveform, format=None, encoder=None):
        """
        Apply the selected audio codec.

        Arguments
        ---------
        waveform: torch.Tensor
            Input waveform of shape `[batch, time]`.
        format: str
            The audio format to use (e.g., "wav", "mp3"). Default is None.
        encoder: str
            The encoder to use for the format (e.g., "opus", "vorbis"). Default is None.

        Returns
        -------
        torch.Tensor:
            Coded version of the input waveform of shape `[batch, time]`,
            placed on the same device as the input.

        Raises
        ------
        ValueError
            If `waveform` is not a 2-dimensional tensor.
        """
        if waveform.dim() != 2:
            raise ValueError(
                "Expected waveform of shape [batch, time], got a tensor "
                f"with {waveform.dim()} dimension(s)."
            )

        audio_effector = torchaudio.io.AudioEffector(
            format=format, encoder=encoder
        )
        # The batch axis is swapped into second position before the call, so
        # each batch entry occupies one column of the tensor handed to
        # AudioEffector. The tensor is moved to the CPU for the codec call.
        # NOTE(review): some codecs may alter the number of output samples --
        # confirm whether callers rely on the time length being preserved.
        waveform_aug = audio_effector.apply(
            waveform.transpose(0, 1).to("cpu"), self.sample_rate
        )
        # Restore the [batch, time] layout and the original device.
        return waveform_aug.transpose(0, 1).to(waveform.device)

    def forward(self, waveform):
        """
        Apply a random audio codec from the available list.

        Arguments
        ---------
        waveform: torch.Tensor
            Input waveform of shape `[batch, time]`.

        Returns
        -------
        torch.Tensor
            Coded version of the input waveform of shape `[batch, time]`.
        """
        # A single pair is drawn per call, so the whole batch shares one codec.
        audio_format, encoder = random.choice(self.available_format_encoders)
        return self.apply_codec(
            waveform, format=audio_format, encoder=encoder
        )