"""
Combinations of processing algorithms to implement downsampling methods.
Authors
* Salah Zaiem
"""
import torch
import torchaudio.transforms as T
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.pooling import Pooling1d
[docs]
class Downsampler(torch.nn.Module):
"""Wrapper for downsampling techniques"""
[docs]
def forward(self, x):
"""Downsampling function
Arguments
---------
x : tensor
Speech samples of shape [B,n_samples] with B the batch size
Returns
-------
Downsampled outputs.
"""
return self.downsampler(x)
[docs]
class SignalDownsampler(Downsampler):
"""Signal downsampling (Decimation)
Arguments
---------
downsampling_factor : int
Factor of downsampling (i.e. ratio (length before ds / length after ds))
initial_sampling_rate : int
Sampling_rate of the input audios
Example
-------
>>> sd = SignalDownsampler(2, 16000)
>>> a = torch.rand([8, 28000])
>>> a = sd(a)
>>> print(a.shape)
torch.Size([8, 14000])
"""
def __init__(self, downsampling_factor, initial_sampling_rate):
super().__init__()
self.downsampling_factor = downsampling_factor
self.target_ds_rate = int(initial_sampling_rate / downsampling_factor)
self.downsampler = T.Resample(
initial_sampling_rate, self.target_ds_rate, dtype=torch.float32
)
[docs]
class Conv1DDownsampler(Downsampler):
"""1D Convolutional downsampling with a learned convolution
Arguments
---------
downsampling_factor : int
Factor of downsampling (i.e. ratio (length before ds / length after ds))
kernel_size : int
Kernel size of the 1D filter (must be an odd integer)
Example
-------
>>> sd = Conv1DDownsampler(3, 161)
>>> a = torch.rand([8, 33000])
>>> a = sd(a)
>>> print(a.shape)
torch.Size([8, 10947])
"""
def __init__(self, downsampling_factor, kernel_size):
super().__init__()
self.kernel_size = kernel_size
self.downsampling_factor = downsampling_factor
self.downsampler = Conv1d(
stride=self.downsampling_factor,
padding="valid",
kernel_size=self.kernel_size,
out_channels=1,
input_shape=[None, None],
)
[docs]
class PoolingDownsampler(Downsampler):
"""1D Pooling downsampling (non-learned)
Arguments
---------
downsampling_factor : int
Factor of downsampling (i.e. ratio (length before ds / length after ds))
kernel_size : int
Kernel size of the 1D filter (must be an odd integer)
padding : int
The number of padding elements to apply.
pool_type : string
Pooling approach, must be within ["avg","max"]
Example
-------
>>> sd = PoolingDownsampler(3, 41)
>>> a = torch.rand([8, 33000])
>>> a = sd(a)
>>> print(a.shape)
torch.Size([8, 10987])
"""
def __init__(
self, downsampling_factor, kernel_size, padding=0, pool_type="avg"
):
super().__init__()
self.kernel_size = kernel_size
self.padding = padding
self.pool_type = pool_type
self.downsampling_factor = downsampling_factor
self.downsampler = Pooling1d(
stride=self.downsampling_factor,
padding=self.padding,
kernel_size=self.kernel_size,
input_dims=3,
pool_type=self.pool_type,
)
# Copied from https://github.com/X-LANCE/SLAM-LLM/blob/main/src/slam_llm/models/projector.py
[docs]
class ConcatDownsampler(Downsampler):
"""Concatenation downsampling with naive frame dropping.
Frames are dropped to make the time dimension divisible by
the downsampling_factor.
Arguments
---------
downsampling_factor : int
Factor of downsampling (i.e. ratio (length before ds / length after ds))
Example
-------
>>> down = ConcatDownsampler(2)
>>> a = torch.rand([8, 40, 40])
>>> a = down(a)
>>> print(a.shape)
torch.Size([8, 20, 80])
"""
def __init__(self, downsampling_factor):
super().__init__()
self.k = downsampling_factor
[docs]
def forward(self, x):
"""Downsamples x given the resampling factor.
Arguments
---------
x : torch.Tensor
Factor of downsampling (i.e. ratio (length before ds / length after ds)).
Returns
-------
x : torch.Tensor
The downsampled tensor.
"""
batch_size, seq_len, dim = x.size()
num_frames_to_discard = seq_len % self.k
if num_frames_to_discard > 0:
x = x[:, :-num_frames_to_discard, :]
seq_len = x.size(1)
x = x.contiguous()
x = x.view(batch_size, seq_len // self.k, dim * self.k)
return x