Source code for speechbrain.augment.codec

"""
Codec Augmentation via torchaudio

This library provides codec augmentation techniques in torchaudio for enhanced
audio data processing.

For detailed guidance and usage examples, refer to the tutorial at:
https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html

Note: This code is compatible with FFmpeg as the torchaudio backend.
When using FFmpeg2, the maximum number of samples for processing is limited to 16.

Authors
 * Mirco Ravanelli 2023
"""

import random
import torch
import torchaudio


[docs] class CodecAugment(torch.nn.Module): """ Apply random audio codecs to input waveforms using torchaudio. This class provides an interface for applying codec augmentation techniques to audio data. Arguments --------- sample_rate: int The sample rate of the input waveform. Example ------- >>> waveform = torch.rand(4, 16000) >>> if torchaudio.list_audio_backends()[0] == 'ffmpeg': ... augmenter = CodecAugment(16000) ... output_waveform = augmenter(waveform) """ def __init__(self, sample_rate=16000): super().__init__() self.sample_rate = sample_rate self.available_format_encoders = [ ("wav", "pcm_mulaw"), ("mp3", None), ("g722", None), ]
[docs] def apply_codec(self, waveform, format=None, encoder=None): """ Apply the selected audio codec. Arguments ---------- waveform: torch.Tensor Input waveform of shape `[batch, time]`. format: str The audio format to use (e.g., "wav", "mp3"). Default is None. encoder: str The encoder to use for the format (e.g., "opus", "vorbis"). Default is None. Returns --------- torch.Tensor: Coded version of the input waveform of shape `[batch, time]`. """ audio_effector = torchaudio.io.AudioEffector( format=format, encoder=encoder ) waveform_aug = audio_effector.apply( waveform.transpose(0, 1).to("cpu"), self.sample_rate ) return waveform_aug.transpose(0, 1).to(waveform.device)
[docs] def forward(self, waveform): """ Apply a random audio codec from the available list. Arguments --------- waveform: torch.Tensor Input waveform of shape `[batch, time]`. Returns --------- torch.Tensor Coded version of the input waveform of shape `[batch, time]`. """ format, encoder = random.choice(self.available_format_encoders) return self.apply_codec(waveform, format=format, encoder=encoder)