Source code for speechbrain.utils.streaming

"""Utilities to assist with designing and training streaming models.

Authors
* Sylvain de Langen 2023
"""

import math
import torch
from typing import Callable, List


def split_fixed_chunks(
    x: torch.Tensor, chunk_size: int, dim: int = -1
) -> List[torch.Tensor]:
    """Split an input tensor `x` into a list of chunk tensors of size
    `chunk_size` along dimension `dim`. Useful for splitting up sequences
    with chunks of fixed sizes.

    If dimension `dim` cannot be evenly split by `chunk_size`, then the last
    chunk will be smaller than `chunk_size`.

    Arguments
    ---------
    x : torch.Tensor
        The tensor to split into chunks, typically a sequence or audio signal.
    chunk_size : int
        The size of each chunk, i.e. the max size of each chunk on dimension
        `dim`.
    dim : int
        Dimension to split along, typically the time dimension.

    Returns
    -------
    List[torch.Tensor]
        A list of chunk tensors, see description and example.
        Guarantees `.size(dim) <= chunk_size`.

    Example
    -------
    >>> import torch
    >>> from speechbrain.utils.streaming import split_fixed_chunks
    >>> x = torch.zeros((16, 10000, 80))
    >>> chunks = split_fixed_chunks(x, 128, dim=1)
    >>> len(chunks)
    79
    >>> chunks[0].shape
    torch.Size([16, 128, 80])
    >>> chunks[-1].shape
    torch.Size([16, 16, 80])
    """
    num_chunks = math.ceil(x.size(dim) / chunk_size)
    split_at_indices = [(i + 1) * chunk_size for i in range(num_chunks - 1)]
    return torch.tensor_split(x, split_at_indices, dim=dim)

def split_wav_lens(
    chunk_lens: List[int], wav_lens: torch.Tensor
) -> List[torch.Tensor]:
    """Converts a single `wav_lens` tensor into a list of tensors, one per
    chunk, typically useful when chunking signals with `split_fixed_chunks`.

    `wav_lens` represents the relative length of each audio within a batch,
    which is typically used for masking. This function computes the relative
    length at chunk level.

    Arguments
    ---------
    chunk_lens : List[int]
        Length of the sequence of every chunk. For example, if `chunks` was
        returned from `split_fixed_chunks(x, chunk_size, dim=1)`, then this
        should be `[chk.size(1) for chk in chunks]`.
    wav_lens : torch.Tensor
        Relative lengths of audio within a batch. For example, for an input
        signal of 100 frames and a batch of 3 elements, `(1.0, 0.5, 0.25)`
        would mean the batch holds audio of 100 frames, 50 frames and 25
        frames respectively.

    Returns
    -------
    List[torch.Tensor]
        A list of chunked wav_lens, see description and example.

    Example
    -------
    >>> import torch
    >>> from speechbrain.utils.streaming import split_wav_lens, split_fixed_chunks
    >>> x = torch.zeros((3, 20, 80))
    >>> chunks = split_fixed_chunks(x, 8, dim=1)
    >>> len(chunks)
    3
    >>> # 20 frames, 13 frames, 17 frames
    >>> wav_lens = torch.tensor([1.0, 0.65, 0.85])
    >>> chunked_wav_lens = split_wav_lens([c.size(1) for c in chunks], wav_lens)
    >>> chunked_wav_lens
    [tensor([1., 1., 1.]), tensor([1.0000, 0.6250, 1.0000]), tensor([1.0000, 0.0000, 0.2500])]
    >>> # wav 1 covers 62.5% (5/8) of the second chunk's frames
    """
    chunk_wav_lens = []

    seq_size = sum(chunk_lens)
    wav_lens_frames = wav_lens * seq_size

    chunk_start_frame = 0
    for chunk_len in chunk_lens:
        # fraction of this chunk covered by each audio, clamped to [0, 1]:
        # 0 when the audio ends before the chunk starts, 1 when the audio
        # spans the whole chunk
        chunk_raw_len = (wav_lens_frames - chunk_start_frame) / chunk_len
        chunk_wav_lens.append(torch.clamp(chunk_raw_len, 0.0, 1.0))
        chunk_start_frame += chunk_len

    return chunk_wav_lens

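# Usage sketch (an editorial addition, not part of the SpeechBrain API): the
# chunk-level relative lengths can be turned into boolean padding masks with
# plain torch ops when each chunk is processed independently. The helper name
# below is illustrative only.
def _demo_chunk_masking():
    """Hypothetical example: derive a padding mask for every chunk."""
    x = torch.rand((3, 20, 80))
    chunks = split_fixed_chunks(x, 8, dim=1)
    wav_lens = torch.tensor([1.0, 0.65, 0.85])
    chunk_wav_lens = split_wav_lens([c.size(1) for c in chunks], wav_lens)

    masked_chunks = []
    for chunk, rel_lens in zip(chunks, chunk_wav_lens):
        # absolute number of valid (non-padding) frames per batch element
        abs_lens = (rel_lens * chunk.size(1)).round().long()
        # mask[b, t] is True when frame t lies within the audio of batch item b
        mask = torch.arange(chunk.size(1)).unsqueeze(0) < abs_lens.unsqueeze(1)
        # zero out padding frames
        masked_chunks.append(chunk * mask.unsqueeze(-1))
    return masked_chunks
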
def infer_dependency_matrix(
    model: Callable, seq_shape: tuple, in_stride: int = 1
):
    """
    Randomizes parts of the input sequence several times in order to detect
    dependencies between input frames and output frames, i.e. whether a given
    output frame depends on a given input frame.

    This can prove useful to check whether a model behaves correctly in a
    streaming context and does not contain accidental dependencies on future
    frames that couldn't be known in a streaming scenario.

    Note that this can get very computationally expensive for long sequences.

    Furthermore, this expects inference to be fully deterministic, or else
    false dependencies may be found. This also means that the model must be
    in eval mode, to inhibit things like dropout layers.

    Arguments
    ---------
    model : Callable
        Can be a model or a function (potentially emulating streaming
        functionality). Does not need to be a trained model; random weights
        should usually suffice.
    seq_shape : tuple
        Shape of the input sequences to test with, expected to look like
        `[batch_size, seq_len, num_feats]`, where `batch_size` may be `1`.
        The function infers dependencies by randomizing parts of input
        sequences of this shape.
    in_stride : int
        Consider only every N-th input frame, for when the input sequences
        are very long (e.g. raw audio) and the output is shorter (subsampled,
        filtered, etc.)

    Returns
    -------
    dependencies : torch.BoolTensor
        Matrix representing whether an output is dependent on an input; index
        using `[in_frame_idx, out_frame_idx]`. `True` indicates a detected
        dependency.
    """
    bs, seq_len, feat_len = seq_shape
    base_seq = torch.rand(seq_shape)

    with torch.no_grad():
        base_out = model(base_seq)

        # run inference twice on identical data; any difference means the
        # model is not deterministic and dependencies cannot be trusted
        if not model(base_seq).equal(base_out):
            raise ValueError(
                "Expected deterministic model, but inferring twice on the same "
                "data yielded different results. Make sure that you use "
                "`eval()` mode so that it does not include randomness."
            )

    out_len, _out_feat_len = base_out.shape[1:]

    deps = torch.zeros(
        ((seq_len + (in_stride - 1)) // in_stride, out_len), dtype=torch.bool
    )

    for in_frame_idx in range(0, seq_len, in_stride):
        # re-randomize a single input frame and compare the outputs against
        # the baseline: any output frame that changed depends on this input
        test_seq = base_seq.clone()
        test_seq[:, in_frame_idx, :] = torch.rand(bs, feat_len)

        with torch.no_grad():
            test_out = model(test_seq)

        for out_frame_idx in range(out_len):
            if not torch.allclose(
                test_out[:, out_frame_idx, :], base_out[:, out_frame_idx, :]
            ):
                deps[in_frame_idx // in_stride][out_frame_idx] = True

    return deps

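# Usage sketch (an editorial addition, not part of the SpeechBrain API): a
# unidirectional LSTM is causal, so no output frame should depend on any
# future input frame. The helper name and shapes are illustrative only.
def _demo_infer_dependency_matrix():
    """Hypothetical example: check that a unidirectional LSTM is causal."""
    rnn = torch.nn.LSTM(input_size=80, hidden_size=64, batch_first=True)
    rnn.eval()  # inference must be deterministic, see the docstring above

    # the callable must map [batch, seq, feats] to [batch, seq, out_feats];
    # the LSTM returns (output, (h_n, c_n)), so keep only the output
    deps = infer_dependency_matrix(lambda x: rnn(x)[0], seq_shape=(1, 32, 80))
    assert deps.shape == (32, 32)

    # deps[i, j] means "output frame j depends on input frame i"; causality
    # requires no dependency with i > j (strictly below the diagonal)
    assert torch.tril(deps.int(), diagonal=-1).sum().item() == 0
    return deps
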
def plot_dependency_matrix(deps):
    """
    Returns a matplotlib figure of a dependency matrix generated by
    `infer_dependency_matrix`.

    At a given point, a red square indicates that a given output frame
    (y-axis) was found to depend on a given input frame (x-axis).

    For example, a fully red image means that all output frames were
    dependent on all of the history. This could be the case of a
    bidirectional RNN or a transformer model, for example.

    Arguments
    ---------
    deps : torch.BoolTensor
        Matrix returned by `infer_dependency_matrix` or one in a compatible
        format.
    """
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    # two-color map: white = no dependency, red = dependency detected
    cmap = ListedColormap(["white", "red"])

    fig, ax = plt.subplots()
    ax.pcolormesh(
        # transpose to [out, in] so that input frames go on the x-axis
        torch.permute(deps, (1, 0)),
        cmap=cmap,
        vmin=False,
        vmax=True,
        edgecolors="gray",
        linewidth=0.5,
    )
    ax.set_title("Dependency plot")
    ax.set_xlabel("in")
    ax.set_ylabel("out")
    ax.set_aspect("equal")
    return fig

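# Usage sketch (an editorial addition, not part of the SpeechBrain API):
# visualizing the matrix from the previous sketch; requires matplotlib. The
# output filename is illustrative only.
def _demo_plot_dependency_matrix():
    """Hypothetical example: render and save a dependency plot."""
    deps = _demo_infer_dependency_matrix()
    fig = plot_dependency_matrix(deps)
    # for a causal model, red cells only appear where the input frame index
    # (x-axis) does not exceed the output frame index (y-axis)
    fig.savefig("dependency_plot.png")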