Source code for speechbrain.augment.time_domain

"""Time-Domain Sequential Data Augmentation Classes

This module contains classes designed for augmenting sequential data in the time domain.
It is particularly useful for enhancing the robustness of neural models during training.
The available data distortions include adding noise, applying reverberation, adjusting playback speed, and more.
All classes are implemented as `torch.nn.Module`, enabling end-to-end differentiability and gradient backpropagation.

Authors:
- Peter Plantinga (2020)
- Mirco Ravanelli (2023)
"""

# Importing libraries
import random
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.dataio.legacy import ExtendedCSVDataset
from speechbrain.dataio.dataloader import make_dataloader
from speechbrain.processing.signal_processing import (
    compute_amplitude,
    dB_to_amplitude,
    convolve1d,
    notch_filter,
    reverberate,
)


class AddNoise(torch.nn.Module):
    """This class additively combines a noise signal to the input signal.

    Arguments
    ---------
    csv_file : str
        The name of a csv file containing the location of the
        noise audio files. If none is provided, white noise will be used.
    csv_keys : list, None, optional
        Default: None. One data entry for the noise data should be specified.
        If None, the csv file is expected to have only one data entry.
    sorting : str
        The order to iterate the csv file, from one of the
        following options: random, original, ascending, and descending.
    num_workers : int
        Number of workers in the DataLoader (See PyTorch DataLoader docs).
    snr_low : int
        The low end of the mixing ratios, in decibels.
    snr_high : int
        The high end of the mixing ratios, in decibels.
    pad_noise : bool
        If True, copy noise signals that are shorter than
        their corresponding clean signals so as to cover the whole clean
        signal. Otherwise, leave the noise un-padded.
    start_index : int
        The index in the noise waveforms to start from. By default,
        chooses a random index in [0, len(noise) - len(waveforms)].
    normalize : bool
        If True, output noisy signals that exceed [-1,1] will be
        normalized to [-1,1].
    noise_funct : callable
        The function used to draw a noise sample. It is used only if the
        csv file containing the noise sequences is not provided. By default,
        torch.randn_like is used (to sample white noise). In general, it must
        be a function that takes in input the original waveform and returns
        a tensor with the corresponding noise to add (e.g., see
        pink_noise_like).
    replacements : dict
        A set of string replacements to carry out in the
        csv file. Each time a key is found in the text, it will be replaced
        with the corresponding value.
    noise_sample_rate : int
        The sample rate of the noise audio signals, so noise can be resampled
        to the clean sample rate if necessary.
    clean_sample_rate : int
        The sample rate of the clean audio signals, so noise can be resampled
        to the clean sample rate if necessary.

    Example
    -------
    >>> import pytest
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clean = signal.unsqueeze(0) # [batch, time, channels]
    >>> noisifier = AddNoise('tests/samples/annotation/noise.csv',
    ...                      replacements={'noise_folder': 'tests/samples/noise'})
    >>> noisy = noisifier(clean, torch.ones(1))
    """

    def __init__(
        self,
        csv_file=None,
        csv_keys=None,
        sorting="random",
        num_workers=0,
        snr_low=0,
        snr_high=0,
        pad_noise=False,
        start_index=None,
        normalize=False,
        noise_funct=torch.randn_like,
        replacements={},
        noise_sample_rate=16000,
        clean_sample_rate=16000,
    ):
        super().__init__()

        self.csv_file = csv_file
        self.csv_keys = csv_keys
        self.sorting = sorting
        self.num_workers = num_workers
        self.snr_low = snr_low
        self.snr_high = snr_high
        self.pad_noise = pad_noise
        self.start_index = start_index
        self.normalize = normalize
        self.replacements = replacements
        self.noise_funct = noise_funct
        self.noise_sample_rate = noise_sample_rate
        self.clean_sample_rate = clean_sample_rate

    def forward(self, waveforms, lengths):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
        lengths : tensor
            Shape should be a single dimension, `[batch]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Copy clean waveform to initialize noisy waveform
        noisy_waveform = waveforms.clone()
        lengths = (lengths * waveforms.shape[1]).unsqueeze(1)

        # Compute the average amplitude of the clean waveforms
        clean_amplitude = compute_amplitude(waveforms, lengths, amp_type="rms")

        # Pick an SNR and use it to compute the mixture amplitude factors
        SNR = torch.rand(len(waveforms), 1, device=waveforms.device)
        SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low
        noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1)

        # Support for multichannel waveforms
        if len(noisy_waveform.shape) == 3:
            noise_amplitude_factor = noise_amplitude_factor.unsqueeze(1)

        # Scale clean signal appropriately
        new_noise_amplitude = noise_amplitude_factor * clean_amplitude
        noisy_waveform *= 1 - noise_amplitude_factor

        # Loop through clean samples and create mixture
        if self.csv_file is None:
            noise_waveform = self.noise_funct(waveforms)
            if noise_waveform.shape[0] == 1:
                noise_waveform = torch.cat(
                    [noise_waveform] * waveforms.shape[0], dim=0
                )
            noise_length = lengths
        else:
            tensor_length = waveforms.shape[1]
            noise_waveform, noise_length = self._load_noise(
                lengths, tensor_length
            )

        # Rescale and add
        noise_amplitude = compute_amplitude(
            noise_waveform, noise_length, amp_type="rms"
        )
        noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14)
        noisy_waveform += noise_waveform

        # Normalizing to prevent clipping
        if self.normalize:
            abs_max, _ = torch.max(
                torch.abs(noisy_waveform), dim=1, keepdim=True
            )
            noisy_waveform = noisy_waveform / abs_max.clamp(min=1.0)

        return noisy_waveform
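
    # A note on the SNR mixing above (a sketch, assuming dB_to_amplitude(SNR)
    # returns the clean/noise amplitude ratio r = 10 ** (SNR / 20)): with
    # f = 1 / (r + 1), the clean signal is scaled by 1 - f = r / (r + 1) and
    # the noise is rescaled to an RMS of f * clean_amplitude, so the mixture
    # has amplitude ratio (1 - f) / f = r, i.e. the SNR that was drawn.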

    def _load_noise(self, lengths, max_length):
        """Load a batch of noises"""
        lengths = lengths.long().squeeze(1)
        batch_size = len(lengths)

        # Load a noise batch
        if not hasattr(self, "data_loader"):
            if self.noise_sample_rate != self.clean_sample_rate:
                self.resampler = Resample(
                    self.noise_sample_rate, self.clean_sample_rate
                )

            # Set parameters based on input
            self.device = lengths.device

            # Create a data loader for the noise waveforms
            if self.csv_file is not None:
                dataset = ExtendedCSVDataset(
                    csvpath=self.csv_file,
                    output_keys=self.csv_keys,
                    sorting=self.sorting
                    if self.sorting != "random"
                    else "original",
                    replacements=self.replacements,
                )
                self.data_loader = make_dataloader(
                    dataset,
                    batch_size=batch_size,
                    num_workers=self.num_workers,
                    shuffle=(self.sorting == "random"),
                )
                self.noise_data = iter(self.data_loader)

        # Load noise to correct device
        noise_batch, noise_len = self._load_noise_batch_of_size(batch_size)
        noise_batch = noise_batch.to(lengths.device)
        noise_len = noise_len.to(lengths.device)

        # Resample noise if necessary
        if hasattr(self, "resampler"):
            noise_batch = self.resampler(noise_batch)

        # Convert relative length to an index
        noise_len = (noise_len * noise_batch.shape[1]).long()

        # Ensure shortest wav can cover speech signal
        # WARNING: THIS COULD BE SLOW IF THERE ARE VERY SHORT NOISES
        if self.pad_noise:
            while torch.any(noise_len < lengths):
                min_len = torch.min(noise_len)
                prepend = noise_batch[:, :min_len]
                noise_batch = torch.cat((prepend, noise_batch), axis=1)
                noise_len += min_len

        # Ensure noise batch is long enough
        elif noise_batch.size(1) < max_length:
            padding = (0, max_length - noise_batch.size(1))
            noise_batch = torch.nn.functional.pad(noise_batch, padding)

        # Select a random starting location in the waveform
        start_index = self.start_index
        if self.start_index is None:
            start_index = 0
            max_chop = (noise_len - lengths).min().clamp(min=1)
            start_index = torch.randint(
                high=max_chop, size=(1,), device=lengths.device
            )

        # Truncate noise_batch to max_length
        noise_batch = noise_batch[:, start_index : start_index + max_length]
        noise_len = (noise_len - start_index).clamp(max=max_length).unsqueeze(1)
        return noise_batch, noise_len

    def _load_noise_batch_of_size(self, batch_size):
        """Concatenate noise batches, then chop to correct size"""
        noise_batch, noise_lens = self._load_noise_batch()

        # Expand
        while len(noise_batch) < batch_size:
            added_noise, added_lens = self._load_noise_batch()
            noise_batch, noise_lens = AddNoise._concat_batch(
                noise_batch, noise_lens, added_noise, added_lens
            )

        # Contract
        if len(noise_batch) > batch_size:
            noise_batch = noise_batch[:batch_size]
            noise_lens = noise_lens[:batch_size]

        return noise_batch, noise_lens

    @staticmethod
    def _concat_batch(noise_batch, noise_lens, added_noise, added_lens):
        """Concatenate two noise batches of potentially different lengths"""

        # pad shorter batch to correct length
        noise_tensor_len = noise_batch.shape[1]
        added_tensor_len = added_noise.shape[1]
        pad = (0, abs(noise_tensor_len - added_tensor_len))
        if noise_tensor_len > added_tensor_len:
            added_noise = torch.nn.functional.pad(added_noise, pad)
            added_lens = added_lens * added_tensor_len / noise_tensor_len
        else:
            noise_batch = torch.nn.functional.pad(noise_batch, pad)
            noise_lens = noise_lens * noise_tensor_len / added_tensor_len

        noise_batch = torch.cat((noise_batch, added_noise))
        noise_lens = torch.cat((noise_lens, added_lens))

        return noise_batch, noise_lens

    def _load_noise_batch(self):
        """Load a batch of noises, restarting iteration if necessary."""
        try:
            # Don't necessarily know the key
            noises, lens = next(self.noise_data).at_position(0)
        except StopIteration:
            self.noise_data = iter(self.data_loader)
            noises, lens = next(self.noise_data).at_position(0)
        return noises, lens
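
# Usage sketch (illustrative): without a csv file, AddNoise synthesizes the
# noise with `noise_funct` instead of loading it, e.g. white noise mixed at a
# random SNR between 0 and 15 dB:
#
#     noisifier = AddNoise(snr_low=0, snr_high=15)  # noise_funct=torch.randn_like
#     noisy = noisifier(clean, torch.ones(clean.shape[0]))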


class AddReverb(torch.nn.Module):
    """This class convolves an audio signal with an impulse response.

    Arguments
    ---------
    csv_file : str
        The name of a csv file containing the location of the
        impulse response files.
    sorting : str
        The order to iterate the csv file, from one of
        the following options: random, original, ascending, and descending.
    num_workers : int
        Number of workers in the DataLoader (See PyTorch DataLoader docs).
    rir_scale_factor : float
        It compresses or dilates the given impulse response.
        If 0 < scale_factor < 1, the impulse response is compressed
        (less reverb), while if scale_factor > 1 it is dilated (more reverb).
    replacements : dict
        A set of string replacements to carry out in the
        csv file. Each time a key is found in the text, it will be replaced
        with the corresponding value.
    reverb_sample_rate : int
        The sample rate of the corruption signals (rirs), so that they
        can be resampled to clean sample rate if necessary.
    clean_sample_rate : int
        The sample rate of the clean signals, so that the corruption
        signals can be resampled to the clean sample rate before convolution.

    Example
    -------
    >>> import pytest
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clean = signal.unsqueeze(0) # [batch, time, channels]
    >>> reverb = AddReverb('tests/samples/annotation/RIRs.csv',
    ...                    replacements={'rir_folder': 'tests/samples/RIRs'})
    >>> reverbed = reverb(clean)
    """

    def __init__(
        self,
        csv_file,
        sorting="random",
        num_workers=0,
        rir_scale_factor=1.0,
        replacements={},
        reverb_sample_rate=16000,
        clean_sample_rate=16000,
    ):
        super().__init__()
        self.csv_file = csv_file
        self.sorting = sorting
        self.num_workers = num_workers
        self.replacements = replacements
        self.reverb_sample_rate = reverb_sample_rate
        self.clean_sample_rate = clean_sample_rate
        self.rir_scale_factor = rir_scale_factor

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        if self.reverb_sample_rate != self.clean_sample_rate:
            self.resampler = Resample(
                self.reverb_sample_rate, self.clean_sample_rate
            )

        # Add channels dimension if necessary
        channel_added = False
        if len(waveforms.shape) == 2:
            waveforms = waveforms.unsqueeze(-1)
            channel_added = True

        # Load and prepare RIR
        rir_waveform = self._load_rir(waveforms)

        # Resample to correct rate
        if hasattr(self, "resampler"):
            rir_waveform = self.resampler(rir_waveform)

        # Compress or dilate RIR
        if self.rir_scale_factor != 1:
            rir_waveform = F.interpolate(
                rir_waveform.transpose(1, -1),
                scale_factor=self.rir_scale_factor,
                mode="linear",
                align_corners=False,
            )
            rir_waveform = rir_waveform.transpose(1, -1)

        rev_waveform = reverberate(waveforms, rir_waveform, rescale_amp="avg")

        # Remove channels dimension if added
        if channel_added:
            return rev_waveform.squeeze(-1)

        return rev_waveform

    def _load_rir(self, waveforms):
        """Load an impulse response from the data loader and prepare it."""
        # Create a data loader for the RIR waveforms
        if not hasattr(self, "data_loader"):
            dataset = ExtendedCSVDataset(
                csvpath=self.csv_file,
                sorting=self.sorting
                if self.sorting != "random"
                else "original",
                replacements=self.replacements,
            )
            self.data_loader = make_dataloader(
                dataset,
                shuffle=(self.sorting == "random"),
                num_workers=self.num_workers,
            )
            self.rir_data = iter(self.data_loader)

        try:
            rir_waveform, length = next(self.rir_data).at_position(0)
        except StopIteration:
            self.rir_data = iter(self.data_loader)
            rir_waveform, length = next(self.rir_data).at_position(0)

        # Make sure RIR has correct channels
        if len(rir_waveform.shape) == 2:
            rir_waveform = rir_waveform.unsqueeze(-1)

        # Make sure RIR has correct type and device
        rir_waveform = rir_waveform.type(waveforms.dtype)
        return rir_waveform.to(waveforms.device)
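
# Usage sketch (illustrative): rir_scale_factor time-compresses or dilates
# the loaded impulse response before convolution, e.g. for a shorter
# reverb tail:
#
#     reverb = AddReverb('tests/samples/annotation/RIRs.csv',
#                        replacements={'rir_folder': 'tests/samples/RIRs'},
#                        rir_scale_factor=0.5)  # 0 < factor < 1: less reverb
#     reverbed = reverb(clean)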


class SpeedPerturb(torch.nn.Module):
    """Slightly speed up or slow down an audio signal.

    Resample the audio signal at a rate that is similar to the original rate,
    to achieve a slightly slower or slightly faster signal. This technique is
    outlined in the paper: "Audio Augmentation for Speech Recognition"

    Arguments
    ---------
    orig_freq : int
        The frequency of the original signal.
    speeds : list
        The speeds that the signal should be changed to, as a percentage of the
        original signal (i.e. `speeds` is divided by 100 to get a ratio).

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90])
    >>> clean = signal.unsqueeze(0)
    >>> perturbed = perturbator(clean)
    >>> clean.shape
    torch.Size([1, 52173])
    >>> perturbed.shape
    torch.Size([1, 46956])
    """

    def __init__(self, orig_freq, speeds=[90, 100, 110]):
        super().__init__()
        self.orig_freq = orig_freq
        self.speeds = speeds

        # Initialize index of perturbation
        self.samp_index = 0

        # Initialize resamplers
        self.resamplers = []
        for speed in self.speeds:
            config = {
                "orig_freq": self.orig_freq,
                "new_freq": self.orig_freq * speed // 100,
            }
            self.resamplers.append(Resample(**config))

    def forward(self, waveform):
        """
        Arguments
        ---------
        waveform : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Perform a random perturbation
        self.samp_index = torch.randint(0, len(self.speeds), (1,))
        perturbed_waveform = self.resamplers[self.samp_index](waveform)

        return perturbed_waveform
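
# Note on the example above: speed=90 builds Resample(orig_freq=16000,
# new_freq=16000 * 90 // 100 = 14400), so the output keeps roughly 90% of
# the samples (52173 -> 46956); interpreted at the original rate, the signal
# plays back in correspondingly less time.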


class Resample(torch.nn.Module):
    """This class resamples audio using the
    :class:`torchaudio resampler <torchaudio.transforms.Resample>` based on
    sinc interpolation.

    Arguments
    ---------
    orig_freq : int
        The sampling frequency of the input signal.
    new_freq : int
        The new sampling frequency after this operation is performed.
    *args
        Additional arguments forwarded to the
        :class:`torchaudio.transforms.Resample` constructor.
    **kwargs
        Additional keyword arguments forwarded to the
        :class:`torchaudio.transforms.Resample` constructor.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> signal = signal.unsqueeze(0) # [batch, time, channels]
    >>> resampler = Resample(orig_freq=16000, new_freq=8000)
    >>> resampled = resampler(signal)
    >>> signal.shape
    torch.Size([1, 52173])
    >>> resampled.shape
    torch.Size([1, 26087])
    """

    def __init__(self, orig_freq=16000, new_freq=16000, *args, **kwargs):
        super().__init__()

        self.orig_freq = orig_freq
        self.new_freq = new_freq

        self.resampler = torchaudio.transforms.Resample(
            orig_freq=orig_freq,
            new_freq=new_freq,
            *args,
            **kwargs,
        )

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Don't do anything if the frequencies are the same
        if self.orig_freq == self.new_freq:
            return waveforms

        unsqueezed = False
        if len(waveforms.shape) == 2:
            waveforms = waveforms.unsqueeze(1)
            unsqueezed = True
        elif len(waveforms.shape) == 3:
            waveforms = waveforms.transpose(1, 2)
        else:
            raise ValueError("Input must be 2 or 3 dimensions")

        # If necessary, migrate the resampler to the current device, for
        # backwards compat with scripts that do not call `resampler.to()`
        # themselves.
        # Please do not reuse the same resampler for tensors that live on
        # different devices, though.
        self.resampler.to(waveforms.device)  # in-place

        # Do resampling
        resampled_waveform = self.resampler(waveforms)

        if unsqueezed:
            resampled_waveform = resampled_waveform.squeeze(1)
        else:
            resampled_waveform = resampled_waveform.transpose(1, 2)

        return resampled_waveform


class DropFreq(torch.nn.Module):
    """This class drops a random frequency from the signal.

    The purpose of this class is to teach models to rely on all parts
    of the signal, not just a few frequency bands.

    Arguments
    ---------
    drop_freq_low : float
        The low end of frequencies that can be dropped,
        as a fraction of the sampling rate / 2.
    drop_freq_high : float
        The high end of frequencies that can be dropped,
        as a fraction of the sampling rate / 2.
    drop_freq_count_low : int
        The low end of number of frequencies that could be dropped.
    drop_freq_count_high : int
        The high end of number of frequencies that could be dropped.
    drop_freq_width : float
        The width of the frequency band to drop, as
        a fraction of the sampling_rate / 2.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = DropFreq()
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> dropped_signal = dropper(signal.unsqueeze(0))
    """

    def __init__(
        self,
        drop_freq_low=1e-14,
        drop_freq_high=1,
        drop_freq_count_low=1,
        drop_freq_count_high=3,
        drop_freq_width=0.05,
    ):
        super().__init__()
        self.drop_freq_low = drop_freq_low
        self.drop_freq_high = drop_freq_high
        self.drop_freq_count_low = drop_freq_count_low
        self.drop_freq_count_high = drop_freq_count_high
        self.drop_freq_width = drop_freq_width

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Clone the input to avoid modifying it in place
        dropped_waveform = waveforms.clone()

        # Add channels dimension
        if len(waveforms.shape) == 2:
            dropped_waveform = dropped_waveform.unsqueeze(-1)

        # Pick number of frequencies to drop
        drop_count = torch.randint(
            low=self.drop_freq_count_low,
            high=self.drop_freq_count_high + 1,
            size=(1,),
        )

        # Pick the frequencies to drop
        drop_range = self.drop_freq_high - self.drop_freq_low
        drop_frequency = (
            torch.rand(drop_count) * drop_range + self.drop_freq_low
        )

        # Filter parameters
        filter_length = 101
        pad = filter_length // 2

        # Start with delta function
        drop_filter = torch.zeros(1, filter_length, 1, device=waveforms.device)
        drop_filter[0, pad, 0] = 1

        # Subtract each frequency
        for frequency in drop_frequency:
            notch_kernel = notch_filter(
                frequency, filter_length, self.drop_freq_width
            ).to(waveforms.device)
            drop_filter = convolve1d(drop_filter, notch_kernel, pad)

        # Manage multiple channels
        if len(waveforms.shape) == 3:
            dropped_waveform = dropped_waveform.reshape(
                dropped_waveform.shape[0] * dropped_waveform.shape[2],
                dropped_waveform.shape[1],
                1,
            )

        # Apply filter
        dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)

        if len(waveforms.shape) == 3:
            dropped_waveform = dropped_waveform.reshape(
                waveforms.shape[0], waveforms.shape[1], waveforms.shape[2]
            )

        # Remove channels dimension if added
        return dropped_waveform.squeeze(-1)
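
# Note (illustrative values): DropFreq frequencies are fractions of the
# Nyquist frequency (sample_rate / 2). For 16 kHz audio, a notch centred
# on 1 kHz therefore corresponds to 1000 / 8000 = 0.125:
#
#     dropper = DropFreq(drop_freq_low=0.125, drop_freq_high=0.125)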


class DropChunk(torch.nn.Module):
    """This class drops portions of the input signal.

    Using `DropChunk` as an augmentation strategy helps models learn to rely
    on all parts of the signal, since it can't expect a given part to be
    present.

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to set the
        signal to zero, in samples.
    drop_length_high : int
        The high end of lengths for which to set the
        signal to zero, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped to zero.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped to zero.
    drop_start : int
        The first index for which dropping will be allowed.
    drop_end : int
        The last index for which dropping will be allowed.
    noise_factor : float
        The factor relative to average amplitude of an utterance
        to use for scaling the white noise inserted. 1 keeps the average
        amplitude the same, while 0 inserts all 0's.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> signal = signal.unsqueeze(0) # [batch, time, channels]
    >>> length = torch.ones(1)
    >>> dropped_signal = dropper(signal, length)
    >>> float(dropped_signal[:, 150])
    0.0
    """

    def __init__(
        self,
        drop_length_low=100,
        drop_length_high=1000,
        drop_count_low=1,
        drop_count_high=3,
        drop_start=0,
        drop_end=None,
        noise_factor=0.0,
    ):
        super().__init__()
        self.drop_length_low = drop_length_low
        self.drop_length_high = drop_length_high
        self.drop_count_low = drop_count_low
        self.drop_count_high = drop_count_high
        self.drop_start = drop_start
        self.drop_end = drop_end
        self.noise_factor = noise_factor

        # Validate low < high
        if drop_length_low > drop_length_high:
            raise ValueError("Low limit must not be more than high limit")
        if drop_count_low > drop_count_high:
            raise ValueError("Low limit must not be more than high limit")

        # Make sure the length doesn't exceed end - start
        if drop_end is not None and drop_end >= 0:
            if drop_start > drop_end:
                raise ValueError("Low limit must not be more than high limit")

            drop_range = drop_end - drop_start
            self.drop_length_low = min(drop_length_low, drop_range)
            self.drop_length_high = min(drop_length_high, drop_range)

    def forward(self, waveforms, lengths):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
        lengths : tensor
            Shape should be a single dimension, `[batch]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Convert relative lengths to absolute sample counts
        lengths = (lengths * waveforms.size(1)).long()
        batch_size = waveforms.size(0)
        dropped_waveform = waveforms.clone()

        # Store original amplitude for computing white noise amplitude
        clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))

        # Pick a number of times to drop
        drop_times = torch.randint(
            low=self.drop_count_low,
            high=self.drop_count_high + 1,
            size=(batch_size,),
        )

        # Iterate batch to set mask
        for i in range(batch_size):
            if drop_times[i] == 0:
                continue

            # Pick lengths
            length = torch.randint(
                low=self.drop_length_low,
                high=self.drop_length_high + 1,
                size=(drop_times[i],),
            )

            # Compute range of starting locations
            start_min = self.drop_start
            if start_min < 0:
                start_min += lengths[i]
            start_max = self.drop_end
            if start_max is None:
                start_max = lengths[i]
            if start_max < 0:
                start_max += lengths[i]
            start_max = max(0, start_max - length.max())

            # Pick starting locations
            start = torch.randint(
                low=start_min, high=start_max + 1, size=(drop_times[i],)
            )

            end = start + length

            # Update waveform
            if not self.noise_factor:
                for j in range(drop_times[i]):
                    dropped_waveform[i, start[j] : end[j]] = 0.0
            else:
                # Uniform distribution of -2 to +2 * avg amplitude should
                # preserve the average for normalization
                noise_max = 2 * clean_amplitude[i] * self.noise_factor
                for j in range(drop_times[i]):
                    # zero-center the noise distribution
                    noise_vec = torch.rand(length[j], device=waveforms.device)
                    noise_vec = 2 * noise_max * noise_vec - noise_max
                    dropped_waveform[i, start[j] : end[j]] = noise_vec

        return dropped_waveform


class FastDropChunk(torch.nn.Module):
    """This class drops portions of the input signal. The difference with
    DropChunk is that in this case we pre-compute the dropping masks the first
    time the forward function is called. For all the other calls, we only
    shuffle and apply them. This makes the code faster and more suitable for
    data augmentation of large batches. It can be used only for fixed-length
    sequences.

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to set the
        signal to zero, in samples.
    drop_length_high : int
        The high end of lengths for which to set the
        signal to zero, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped to zero.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped to zero.
    drop_start : int
        The first index for which dropping will be allowed.
    drop_end : int
        The last index for which dropping will be allowed.
    n_masks : int
        The number of precomputed masks.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = FastDropChunk(drop_start=100, drop_end=200)
    >>> signal = torch.rand(10, 250, 22)
    >>> dropped_signal = dropper(signal)
    """

    def __init__(
        self,
        drop_length_low=100,
        drop_length_high=1000,
        drop_count_low=1,
        drop_count_high=10,
        drop_start=0,
        drop_end=None,
        n_masks=1000,
    ):
        super().__init__()
        self.drop_length_low = drop_length_low
        self.drop_length_high = drop_length_high
        self.drop_count_low = drop_count_low
        self.drop_count_high = drop_count_high
        self.drop_start = drop_start
        self.drop_end = drop_end
        self.n_masks = n_masks
        self.first = True

        # Validate low < high
        if drop_length_low > drop_length_high:
            raise ValueError("Low limit must not be more than high limit")
        if drop_count_low > drop_count_high:
            raise ValueError("Low limit must not be more than high limit")

        # Make sure the length doesn't exceed end - start
        if drop_end is not None and drop_end >= 0:
            if drop_start > drop_end:
                raise ValueError("Low limit must not be more than high limit")

            drop_range = drop_end - drop_start
            self.drop_length_low = min(drop_length_low, drop_range)
            self.drop_length_high = min(drop_length_high, drop_range)

    def initialize_masks(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        dropped_masks : tensor
            Tensor of size `[n_masks, time]` with the dropped chunks. Dropped
            regions are assigned to 0.
        """
        if self.n_masks < waveforms.shape[0]:
            raise ValueError("n_masks cannot be smaller than the batch size")

        # Initializing the drop mask
        dropped_masks = torch.ones(
            [self.n_masks, self.sig_len], device=waveforms.device
        )

        # Pick a number of times to drop
        drop_times = torch.randint(
            low=self.drop_count_low,
            high=self.drop_count_high + 1,
            size=(self.n_masks,),
            device=waveforms.device,
        )

        # Iterate over the masks to set the dropped regions
        for i in range(self.n_masks):
            if drop_times[i] == 0:
                continue

            # Pick lengths
            length = torch.randint(
                low=self.drop_length_low,
                high=self.drop_length_high + 1,
                size=(drop_times[i],),
                device=waveforms.device,
            )

            # Compute range of starting locations
            start_min = self.drop_start
            if start_min < 0:
                start_min += self.sig_len
            start_max = self.drop_end
            if start_max is None:
                start_max = self.sig_len
            if start_max < 0:
                start_max += self.sig_len
            start_max = max(0, start_max - length.max())

            # Pick starting locations
            start = torch.randint(
                low=start_min,
                high=start_max + 1,
                size=(drop_times[i],),
                device=waveforms.device,
            )

            end = start + length

            # Update mask
            for j in range(drop_times[i]):
                dropped_masks[i, start[j] : end[j]] = 0.0

        return dropped_masks

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        dropped_waveforms = waveforms.clone()

        # Initialize the masks
        if self.first:
            self.sig_len = waveforms.shape[1]
            self.dropped_masks = self.initialize_masks(waveforms)
            self.first = False

        # Random Permutation
        rand_perm = torch.randperm(self.dropped_masks.shape[0])
        self.dropped_masks = self.dropped_masks[rand_perm, :]

        # Random shift in time
        rand_shifts = torch.randint(low=0, high=self.sig_len, size=(1,))
        self.dropped_masks = torch.roll(
            self.dropped_masks, shifts=rand_shifts.item(), dims=1
        )

        if len(waveforms.shape) == 3:
            dropped_waveforms = dropped_waveforms * self.dropped_masks[
                0 : waveforms.shape[0]
            ].unsqueeze(2)
        else:
            dropped_waveforms = (
                dropped_waveforms * self.dropped_masks[0 : waveforms.shape[0]]
            )

        return dropped_waveforms
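
# Usage sketch (illustrative): the masks are precomputed on the first call
# for the observed signal length, so all later batches must keep that same
# length:
#
#     dropper = FastDropChunk(n_masks=1000)
#     out = dropper(torch.rand(16, 8000))  # builds masks with sig_len=8000
#     out = dropper(torch.rand(16, 8000))  # reuses shuffled/shifted masks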


class DoClip(torch.nn.Module):
    """This class mimics audio clipping by clamping the input tensor.
    First, it normalizes the waveforms to lie between -1 and 1. Then,
    clipping is applied. Finally, the original amplitude is restored.

    Arguments
    ---------
    clip_low : float
        The low end of amplitudes for which to clip the signal.
    clip_high : float
        The high end of amplitudes for which to clip the signal.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> clipper = DoClip(clip_low=0.01, clip_high=0.01)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clipped_signal = clipper(signal.unsqueeze(0))
    """

    def __init__(self, clip_low=0.5, clip_high=0.5):
        super().__init__()
        self.clip_low = clip_low
        self.clip_high = clip_high

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Normalize the signal
        abs_max, _ = torch.max(torch.abs(waveforms), dim=1, keepdim=True)
        waveforms = waveforms / abs_max

        # Randomly select clip value
        clipping_range = self.clip_high - self.clip_low
        clip_value = (
            torch.rand(1, device=waveforms.device)[0] * clipping_range
            + self.clip_low
        )

        # Apply clipping
        clipped_waveform = waveforms.clamp(-clip_value, clip_value)

        # Restore original amplitude
        clipped_waveform = clipped_waveform * abs_max / clip_value

        return clipped_waveform
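
# Worked example (illustrative): with clip_value = 0.5, the normalized signal
# is clamped to [-0.5, 0.5]; multiplying by abs_max / 0.5 then restores the
# original peak, so only the waveform shape changes (hard clipping
# distortion), not the overall level.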


class RandAmp(torch.nn.Module):
    """This class multiplies the signal by a random amplitude. First, the
    signal is normalized to have amplitude between -1 and 1. Then it is
    multiplied by a random number.

    Arguments
    ---------
    amp_low : float
        The minimum amplitude multiplication factor.
    amp_high : float
        The maximum amplitude multiplication factor.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> rand_amp = RandAmp(amp_low=0.25, amp_high=1.75)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> output_signal = rand_amp(signal.unsqueeze(0))
    """

    def __init__(self, amp_low=0.5, amp_high=1.5):
        super().__init__()
        self.amp_low = amp_low
        self.amp_high = amp_high

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Normalize the signal
        abs_max, _ = torch.max(torch.abs(waveforms), dim=1, keepdim=True)
        waveforms = waveforms / abs_max

        # Pick a random amplitude for each example in the batch
        rand_range = self.amp_high - self.amp_low
        amp = (
            torch.rand(waveforms.shape[0], device=waveforms.device)
            * rand_range
            + self.amp_low
        )
        amp = amp.unsqueeze(1)
        if len(waveforms.shape) == 3:
            amp = amp.unsqueeze(2)
        waveforms = waveforms * amp

        return waveforms


class ChannelDrop(torch.nn.Module):
    """This class drops random channels in a multi-channel input waveform.

    Arguments
    ---------
    drop_rate : float
        The channel dropout factor.

    Example
    -------
    >>> signal = torch.rand(4, 256, 8)
    >>> ch_drop = ChannelDrop(drop_rate=0.5)
    >>> output_signal = ch_drop(signal)
    """

    def __init__(self, drop_rate=0.1):
        super().__init__()
        self.drop_rate = drop_rate

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Pick the channels to drop
        x = torch.rand(waveforms.shape[-1], device=waveforms.device)
        channel_mask = x.ge(self.drop_rate)
        waveforms = waveforms * channel_mask.unsqueeze(0).unsqueeze(1)
        return waveforms
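
# Note: a single Bernoulli-style mask of shape [channels] is drawn per call
# and shared across the batch; since x ~ Uniform(0, 1), each channel is
# zeroed with probability drop_rate.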


class ChannelSwap(torch.nn.Module):
    """This class randomly swaps N channels.

    Arguments
    ---------
    min_swap : int
        The minimum number of channels to swap.
    max_swap : int
        The maximum number of channels to swap.

    Example
    -------
    >>> signal = torch.rand(4, 256, 8)
    >>> ch_swap = ChannelSwap()
    >>> output_signal = ch_swap(signal)
    """

    def __init__(self, min_swap=0, max_swap=0):
        super().__init__()
        self.min_swap = min_swap
        self.max_swap = max_swap

        # Check arguments
        if self.min_swap < 0:
            raise ValueError("min_swap must be >= 0.")
        if self.max_swap < 0:
            raise ValueError("max_swap must be >= 0.")
        if self.max_swap < self.min_swap:
            raise ValueError("max_swap must be >= min_swap")

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        # Pick random channel permutations
        rand_perm1 = torch.randperm(waveforms.shape[-1])
        rand_perm2 = torch.randperm(waveforms.shape[-1])
        N_swaps = torch.randint(
            low=self.min_swap, high=self.max_swap + 1, size=(1,)
        )

        if N_swaps < waveforms.shape[-1]:
            for i in range(N_swaps):
                store_channel = waveforms[:, :, rand_perm2[i]]
                waveforms[:, :, rand_perm2[i]] = waveforms[:, :, rand_perm1[i]]
                waveforms[:, :, rand_perm1[i]] = store_channel
        else:
            # Full swap
            waveforms = waveforms[:, :, rand_perm1]

        return waveforms


class CutCat(torch.nn.Module):
    """This class combines segments (with equal length in time) of the
    time series contained in the batch.
    Proposed for EEG signals in https://doi.org/10.1016/j.neunet.2021.05.032.

    Arguments
    ---------
    min_num_segments : int
        The minimum number of segments to combine.
    max_num_segments : int
        The maximum number of segments to combine. Default is 10.

    Example
    -------
    >>> signal = torch.ones((4, 256, 22)) * torch.arange(4).reshape((4, 1, 1,))
    >>> cutcat = CutCat()
    >>> output_signal = cutcat(signal)
    """

    def __init__(self, min_num_segments=2, max_num_segments=10):
        super().__init__()
        self.min_num_segments = min_num_segments
        self.max_num_segments = max_num_segments

        # Check arguments
        if self.max_num_segments < self.min_num_segments:
            raise ValueError("max_num_segments must be >= min_num_segments")

    def forward(self, waveforms):
        """
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        """
        if (
            waveforms.shape[0] > 1
        ):  # only if there are at least 2 examples in batch
            # rolling waveforms to point to segments of other examples in batch
            waveforms_rolled = torch.roll(waveforms, shifts=1, dims=0)
            # picking number of segments to use
            num_segments = torch.randint(
                low=self.min_num_segments,
                high=self.max_num_segments + 1,
                size=(1,),
            )
            # index of cuts (both starts and stops)
            idx_cut = torch.linspace(
                0, waveforms.shape[1], num_segments.item() + 1, dtype=torch.int
            )
            for i in range(idx_cut.shape[0] - 1):
                # half of segments from other examples in batch
                if i % 2 == 1:
                    start = idx_cut[i]
                    stop = idx_cut[i + 1]
                    waveforms[:, start:stop, ...] = waveforms_rolled[
                        :, start:stop, ...  # noqa: W504
                    ]

        return waveforms


def pink_noise_like(waveforms, alpha_low=1.0, alpha_high=1.0, sample_rate=50):
    """Creates a sequence of pink noise (also known as 1/f noise). The pink
    noise is obtained by multiplying the spectrum of a white noise sequence
    by a factor (1/f^alpha). The alpha factor controls the decrease factor in
    the frequency domain (alpha=0 adds white noise, alpha>>0 adds
    low-frequency noise). It is randomly sampled between alpha_low and
    alpha_high. With negative alpha, this function generates blue noise.

    Arguments
    ---------
    waveforms : torch.Tensor
        The original waveform. It is just used to infer the shape.
    alpha_low : float
        The minimum value for the alpha spectral smoothing factor.
    alpha_high : float
        The maximum value for the alpha spectral smoothing factor.
    sample_rate : float
        The sample rate of the original signal.

    Example
    -------
    >>> waveforms = torch.randn(4,257,10)
    >>> noise = pink_noise_like(waveforms)
    >>> noise.shape
    torch.Size([4, 257, 10])
    """
    # Sampling white noise (flat spectrum)
    white_noise = torch.randn_like(waveforms)

    # Computing the fft of the input white noise
    white_noise_fft = torch.fft.fft(white_noise, dim=1)

    # Sampling the spectral smoothing factor
    rand_range = alpha_high - alpha_low
    alpha = (
        torch.rand(waveforms.shape[0], device=waveforms.device) * rand_range
        + alpha_low
    )

    # Preparing the spectral mask (1/f^alpha)
    f = torch.linspace(
        0,
        sample_rate / 2,
        int(white_noise.shape[1] / 2),
        device=waveforms.device,
    )
    spectral_mask = 1 / torch.pow(f.unsqueeze(0), alpha.unsqueeze(1))

    # Avoid inf due to 1/0 division at f=0
    spectral_mask[:, 0] = spectral_mask[:, 1]

    # Mask for the upper part of the spectrum (f > sample_rate/2)
    spectral_mask_up = torch.flip(spectral_mask, dims=(1,))

    # Managing odd/even sequences
    if white_noise.shape[1] % 2:
        mid_element = spectral_mask[
            :, int(white_noise.shape[1] / 2) - 1
        ].unsqueeze(1)
        spectral_mask = torch.cat(
            [spectral_mask, mid_element, spectral_mask_up], dim=1
        )
    else:
        spectral_mask = torch.cat([spectral_mask, spectral_mask_up], dim=1)

    # Managing multi-channel inputs
    if len(white_noise.shape) == 3:
        spectral_mask = spectral_mask.unsqueeze(2)

    # Spectral masking
    pink_noise_fft = white_noise_fft * spectral_mask

    # Return to the time-domain
    pink_noise = torch.fft.ifft(pink_noise_fft, dim=1).real
    return pink_noise
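
# Note on pink_noise_like: the 1/f^alpha mask is built for the positive
# frequencies only and mirrored onto the upper half of the spectrum, so the
# mask is symmetric and the masked spectrum stays approximately Hermitian;
# the .real after the inverse FFT then discards only numerical residue.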


class DropBitResolution(torch.nn.Module):
    """This class transforms a float32 tensor into a lower resolution one
    (e.g., int16, int8, float16) and then converts it back to a float32.
    This process loses information and can be used for data augmentation.

    Arguments
    ---------
    target_dtype : str
        One of "int16", "int8", "float16". If "random", the bit resolution
        is randomly selected among the options listed above.

    Example
    -------
    >>> dropper = DropBitResolution()
    >>> signal = torch.rand(4, 16000)
    >>> signal_dropped = dropper(signal)
    """

    def __init__(self, target_dtype="random"):
        super().__init__()

        self.target_dtype = target_dtype
        self.bit_depths = {
            "int16": (16, torch.int16),
            "int8": (8, torch.int8),
            "float16": (16, torch.float16),
        }

        if (
            self.target_dtype != "random"
            and self.target_dtype not in self.bit_depths
        ):
            raise ValueError(
                f"target_dtype must be one of {list(self.bit_depths.keys())}"
            )

    def forward(self, float32_tensor):
        """
        Arguments
        ---------
        float32_tensor : torch.Tensor
            Float32 tensor with shape `[batch, time]` or
            `[batch, time, channels]`.

        Returns
        -------
        torch.Tensor
            Tensor of shape `[batch, time]` or `[batch, time, channels]`
            (float32).
        """
        if self.target_dtype == "random":
            random_key = random.choice(list(self.bit_depths.keys()))
            bit, target_dtype = self.bit_depths[random_key]
        else:
            bit, target_dtype = self.bit_depths[self.target_dtype]

        # Define a scale factor to map the float32 range to the target bit depth
        if target_dtype != torch.float16:
            scale_factor = (2 ** (bit - 1) - 1) / float32_tensor.abs().max()
            quantized_tensor = (float32_tensor * scale_factor).to(target_dtype)
        else:
            quantized_tensor = float32_tensor.half()
            scale_factor = 1

        # Dequantize to recover (an approximation of) the original float32 values
        dequantized_tensor = quantized_tensor.to(torch.float32) / scale_factor
        return dequantized_tensor
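
# Worked example (illustrative): for "int8", scale_factor = 127 / max|x|
# (2 ** (8 - 1) - 1 = 127), so values are mapped onto the signed 8-bit grid
# and back; the round-trip keeps the original scale but leaves at most 255
# distinct amplitude levels, mimicking low bit-depth audio.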