\n",
"
speechbrain.dataio.batch.PaddedBatch
def __init__(examples, padded_keys=None, device_prep_keys=None, padding_func=batch_pad_right, padding_kwargs=None, per_key_padding_kwargs=None, apply_default_convert=True, nonpadded_stack=True)
/usr/local/lib/python3.12/dist-packages/speechbrain/dataio/batch.pyCollate_fn when examples are dicts and have variable-length sequences.\n",
"\n",
"Different elements in the examples get matched by key.\n",
"All numpy tensors get converted to Torch (PyTorch default_convert)\n",
"Then, by default, all torch.Tensor valued elements get padded and support\n",
"collective pin_memory() and to() calls.\n",
"Regular Python data types are just collected in a list.\n",
"\n",
"Arguments\n",
"---------\n",
"examples : list\n",
" List of example dicts, as produced by Dataloader.\n",
"padded_keys : list, None\n",
" (Optional) List of keys to pad on. If None, pad all torch.Tensors\n",
"device_prep_keys : list, None\n",
" (Optional) Only these keys participate in collective memory pinning and moving with\n",
" to().\n",
" If None, defaults to all items with torch.Tensor values.\n",
"padding_func : callable, optional\n",
" Called with a list of tensors to be padded together. Needs to return\n",
" two tensors: the padded data, and another tensor for the data lengths.\n",
"padding_kwargs : dict, None\n",
" (Optional) Extra kwargs to pass to padding_func. E.G. mode, value\n",
" This is used as the default padding configuration for all keys.\n",
"per_key_padding_kwargs : dict, None\n",
" (Optional) Per-key padding configuration. Keys in this dict should match\n",
" the keys in the examples. Each value should be a dict with padding parameters\n",
" (e.g., {'value': -100, 'mode': 'constant'}). If a key is not in this dict,\n",
" the global padding_kwargs will be used.\n",
"apply_default_convert : bool\n",
" Whether to apply PyTorch default_convert (numpy to torch recursively,\n",
" etc.) on all data. Default:True, usually does the right thing.\n",
"nonpadded_stack : bool\n",
" Whether to apply PyTorch-default_collate-like stacking on values that\n",
" didn't get padded. This stacks if it can, but doesn't error out if it\n",
" cannot. Default:True, usually does the right thing.\n",
"\n",
"Example\n",
"-------\n",
">>> batch = PaddedBatch(\n",
"... [\n",
"... {"id": "ex1", "foo": torch.Tensor([1.0])},\n",
"... {"id": "ex2", "foo": torch.Tensor([2.0, 1.0])},\n",
"... ]\n",
"... )\n",
">>> # Attribute or key-based access:\n",
">>> batch.id\n",
"['ex1', 'ex2']\n",
">>> batch["id"]\n",
"['ex1', 'ex2']\n",
">>> # torch.Tensors get padded\n",
">>> type(batch.foo)\n",
"<class 'speechbrain.dataio.batch.PaddedData'>\n",
">>> batch.foo.data\n",
"tensor([[1., 0.],\n",
" [2., 1.]])\n",
">>> batch.foo.lengths\n",
"tensor([0.5000, 1.0000])\n",
">>> # Batch supports collective operations:\n",
">>> _ = batch.to(dtype=torch.half)\n",
">>> batch.foo.data\n",
"tensor([[1., 0.],\n",
" [2., 1.]], dtype=torch.float16)\n",
">>> batch.foo.lengths\n",
"tensor([0.5000, 1.0000], dtype=torch.float16)\n",
">>> # Numpy tensors get converted to torch and padded as well:\n",
">>> import numpy as np\n",
">>> batch = PaddedBatch(\n",
"... [{"wav": np.asarray([1, 2, 3, 4])}, {"wav": np.asarray([1, 2, 3])}]\n",
"... )\n",
">>> batch.wav # +ELLIPSIS\n",
"PaddedData(data=tensor([[1, 2,...\n",
">>> # Basic stacking collation deals with non padded data:\n",
">>> batch = PaddedBatch(\n",
"... [\n",
"... {\n",
"... "spk_id": torch.tensor([1]),\n",
"... "wav": torch.tensor([0.1, 0.0, 0.3]),\n",
"... },\n",
"... {\n",
"... "spk_id": torch.tensor([2]),\n",
"... "wav": torch.tensor([0.2, 0.3, -0.1]),\n",
"... },\n",
"... ],\n",
"... padded_keys=["wav"],\n",
"... )\n",
">>> batch.spk_id\n",
"tensor([[1],\n",
" [2]])\n",
">>> # And some data is left alone:\n",
">>> batch = PaddedBatch(\n",
"... [{"text": ["Hello"]}, {"text": ["How", "are", "you?"]}]\n",
"... )\n",
">>> batch.text\n",
"[['Hello'], ['How', 'are', 'you?']]\n",
">>> # Per-key padding configuration:\n",
">>> batch = PaddedBatch(\n",
"... [\n",
"... {\n",
"... "wav": torch.tensor([1, 2, 3]),\n",
"... "labels": torch.tensor([1, 2]),\n",
"... },\n",
"... {"wav": torch.tensor([4, 5]), "labels": torch.tensor([3])},\n",
"... ],\n",
"... per_key_padding_kwargs={\n",
"... "wav": {"value": 0},\n",
"... "labels": {"value": -100},\n",
"... },\n",
"... )\n",
">>> batch.wav.data\n",
"tensor([[1, 2, 3],\n",
" [4, 5, 0]])\n",
">>> batch.labels.data\n",
"tensor([[ 1, 2],\n",
" [ 3, -100]])\n",
" \n",
"
"
]
},
"metadata": {},
"execution_count": 111
}
],
"source": [
"batch_obj # the dataloader returns a PaddedBatch object now\n",
"type(batch_obj)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8IZb33Bt0L9d"
},
"source": [
"Dynamic Items can be accessed in the batch object by using `dict` syntax:"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"id": "dB42KZ3AxLth",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "643c11bb-fbf0-49e1-a9b5-7a1d2799e0c6"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PaddedData(data=tensor([[0],\n",
" [0]]), lengths=tensor([1., 1.]))"
]
},
"metadata": {},
"execution_count": 112
}
],
"source": [
"batch_obj[\"spkid_encoded\"]"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"id": "ZKJlXaW10XRg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "c5186323-026f-4477-d0b8-f31ac2db1614"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PaddedData(data=tensor([[ 0.0020, 0.0006, 0.0004, ..., -0.0033, -0.0034, -0.0029],\n",
" [-0.0018, -0.0023, -0.0027, ..., 0.0000, 0.0000, 0.0000]]), lengths=tensor([1.0000, 0.6220]))"
]
},
"metadata": {},
"execution_count": 113
}
],
"source": [
"batch_obj[\"signal\"]"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"id": "gEG6g0sc0ZXn",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5fa1ffa7-b0cb-436a-dd81-4d4f01f9e087"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['3576-138058-0019', '3576-138058-0021']"
]
},
"metadata": {},
"execution_count": 114
}
],
"source": [
"batch_obj[\"id\"] # example ids in this batch useful for debugging"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UF3k_9060g4y"
},
"source": [
"As mentioned, all elements of a `PaddedBatch` that are `torch.Tensor`s are padded together by adding zeros to the right.\n",
"When one of these elements is accessed, a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple) is returned, holding the padded tensor (`data`) and a `lengths` tensor."
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"id": "G2pFsSa702_x"
},
"outputs": [],
"source": [
"wav_data, length = batch_obj[\"signal\"]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GhPDU20bcvNA"
},
"source": [
"As it is a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple), the two items *lengths* and *data* are also accessible as attributes:"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"id": "glL_x9sqb8vO"
},
"outputs": [],
"source": [
"lengths = batch_obj[\"signal\"].lengths\n",
"wav_data = batch_obj[\"signal\"].data"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"id": "YDzqHldc0bBe",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "0d13cecb-97ed-4279-e7d1-50c31215e8c0"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"tensor([1.0000, 0.6220])"
]
},
"metadata": {},
"execution_count": 117
}
],
"source": [
"lengths"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zSbkh9tJ08j2"
},
"source": [
"This length tensor contains the relative true length of each sequence.\n",
"In this example it means that the first example in the batch has not been padded (relative length == 1), while the second one has been padded: only about 62% of its padded length is actual signal.\n",
"\n",
"The use of relative lengths instead of absolute indexes guarantees that these values do not change even after feature extraction: the relative true length remains the same even after an STFT, whatever the window is.\n",
"\n",
"The absolute indexes are easy to obtain:"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"id": "tx8HUi2U1Tsu",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "63974e5d-2a89-442d-87cf-6a294ab7ac24"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"tensor([158960, 98879])"
]
},
"metadata": {},
"execution_count": 118
}
],
"source": [
"# Round before casting to integers: the relative lengths are floating-point,\n",
"# so the product can land just below the true sample count and a bare\n",
"# .long() would truncate it (off-by-one absolute length).\n",
"abs_lens = (lengths * wav_data.shape[1]).round().long()\n",
"abs_lens"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"id": "KLClRlo-1xjg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ed4b3d3a-30a1-4b11-8368-74d97115736f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"tensor([-0.0018, -0.0023, -0.0027, ..., 0.0042, 0.0052, 0.0038])"
]
},
"metadata": {},
"execution_count": 119
}
],
"source": [
"wav_data[1][:abs_lens[1]] # no zeros"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"id": "NmH1C4Dx2Fyt",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a561c82d-e9cd-4799-bdb6-90b9dbada775"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"tensor([0.0019, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000])"
]
},
"metadata": {},
"execution_count": 120
}
],
"source": [
"wav_data[1][abs_lens[1]:] # zeros begin at abs_lens[1]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BRIP5d4h2e2Y"
},
"source": [
"The `PaddedBatch` object allows for conveniently moving all dynamic items which are `torch.Tensor` to the right device using `to`:"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"id": "gtn6h1hM2Xw2"
},
"outputs": [],
"source": [
"batch_obj = batch_obj.to(\"cpu\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IyZ84-4-20wn"
},
"source": [
"Of course items which are not tensors such as `id` are not moved and are not padded. Instead they are simply returned as a list."
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"id": "O8-4oUMw2xNd",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "05816f8e-bd9e-41a8-f469-37c38f5e305f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['3576-138058-0019', '3576-138058-0021']"
]
},
"metadata": {},
"execution_count": 122
}
],
"source": [
"batch_obj[\"id\"]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vz09N7Zb3Iho"
},
"source": [
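"It is also possible to iterate over a `PaddedBatch`. Iteration yields the collated value of each key (here `id`, `spkid_encoded` and `signal`), not the individual examples:"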
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"id": "05vyHnk23H8X",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5d6f6b79-645c-4fe9-90eb-573c06671964"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['3576-138058-0019', '3576-138058-0021']\n",
"PaddedData(data=tensor([[0],\n",
" [0]]), lengths=tensor([1., 1.]))\n",
"PaddedData(data=tensor([[ 0.0020, 0.0006, 0.0004, ..., -0.0033, -0.0034, -0.0029],\n",
" [-0.0018, -0.0023, -0.0027, ..., 0.0000, 0.0000, 0.0000]]), lengths=tensor([1.0000, 0.6220]))\n"
]
}
],
"source": [
"for ex in batch_obj:\n",
" print(ex)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sXjZOd3O3tt1"
},
"source": [
"An element can also be retrieved by its position among the keys (position 1 is `spkid_encoded` here):"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"id": "2mPr65B22_r8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "087360a2-824d-43d2-9a66-4d64e6dc09b8"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PaddedData(data=tensor([[0],\n",
" [0]]), lengths=tensor([1., 1.]))"
]
},
"metadata": {},
"execution_count": 124
}
],
"source": [
"batch_obj.at_position(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ze8DO5Nj32eE"
},
"source": [
"These methods can be conveniently used in the `compute_forward` and `compute_objectives` methods of the `Brain` class,\n",
"as shown in the first example of this tutorial, where a complete data-IO pipeline was illustrated:\n",
"\n",
"\n",
"\n",
"\n",
"```python\n",
"def compute_forward(self, batch, stage):\n",
"    audio, audio_len = batch[\"sig\"]\n",
"    # the examples are automatically padded, audio_len contains the relative\n",
"    # length of the original sequence.\n",
"    return self.modules.model(audio.unsqueeze(1)).mean(-1).unsqueeze(-1)\n",
"\n",
"def compute_objectives(self, logits, batch, stage):\n",
"    spk_ids, _ = batch[\"spkid_encoded\"]\n",
"    return torch.nn.functional.cross_entropy(logits, spk_ids)\n",
"```\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ufpPurk3bsvc"
},
"source": [
"## Full Example: Training a simple Speaker Recognition System."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yKox1vfgbyey"
},
"source": [
"Hereafter we show how the **DynamicItemDataset**, **DIPs** and **CategoricalEncoder** can be used to build a data pipeline for Speaker Recognition.\n",
"\n",
"In particular we have to:\n",
"\n",
"- read the audio\n",
"- read the speaker ID from annotation and encode it to integer\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_0e4ksGZpErL"
},
"source": [
"We first instantiate the dataset from the JSON annotation:"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"id": "9RKlcn6JpKo9"
},
"outputs": [],
"source": [
"dataset = DynamicItemDataset.from_json(\"data.json\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KdI7Uakdi1kh"
},
"source": [
"Then fit the **CategoricalEncoder** to speaker IDs (`spkID`) in the annotation."
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"id": "4dLTFQyPi0yk"
},
"outputs": [],
"source": [
"spk_id_encoder = CategoricalEncoder()\n",
"spk_id_encoder.update_from_didataset(dataset, \"spkID\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aM2U403ejF3b"
},
"source": [
"We add the **DIP** which encodes `spkID`"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"id": "IRWERFtjjFEF"
},
"outputs": [],
"source": [
"dataset.add_dynamic_item(spk_id_encoder.encode_label_torch, takes=\"spkID\", provides=\"spk_encoded\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2PNpmwVZjPLn"
},
"source": [
"We add the **DIP** for reading the audio\n"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"id": "L4ccm3_KjSOF"
},
"outputs": [],
"source": [
"dataset.add_dynamic_item(speechbrain.dataio.dataio.read_audio, takes=\"file_path\", provides=\"signal\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ge4RwtGSNWMm"
},
"source": [
"#### Caching features for the speaker pipeline\n",
"\n",
"We can cache the filterbank features so each epoch only reads them from a single HDF5 file instead of recomputing them or creating thousands of small `.pt` files. This mirrors the caching workflow shown earlier, but plugs directly into the speaker-recognition `DynamicItemDataset`.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_6davA36NWMm"
},
"execution_count": 129,
"outputs": [],
"source": [
"import os, shutil\n",
"from speechbrain.integrations.hdf5.cached_item import CachedHDF5DynamicItem\n",
"from speechbrain.lobes.features import Fbank\n",
"\n",
"# Clean cache for a reproducible run in the notebook\n",
"shutil.rmtree(\"spk_feat_cache\", ignore_errors=True)\n",
"os.makedirs(\"spk_feat_cache\", exist_ok=True)\n",
"\n",
"fbank = Fbank()\n",
"\n",
"@CachedHDF5DynamicItem.cache(\n",
"    cache_location=\"spk_feat_cache\",\n",
"    cache_filename=\"speaker_feats.hdf5\",\n",
"    compression=\"gzip\",\n",
")\n",
"@speechbrain.utils.data_pipeline.takes(\"id\", \"signal\")\n",
"@speechbrain.utils.data_pipeline.provides(\"feats\")\n",
"def cached_fbank(uid, sig):\n",
"    \"\"\"Compute filterbank features for one waveform.\n",
"\n",
"    `uid` is not used in the body; it is requested through `takes`\n",
"    (presumably as the cache key -- TODO confirm against CachedHDF5DynamicItem).\n",
"    \"\"\"\n",
"    batched_sig = sig.unsqueeze(0)  # add a leading batch axis\n",
"    batched_feats = fbank(batched_sig)\n",
"    return batched_feats.squeeze(0)  # drop the batch axis again\n",
"\n",
"dataset.add_dynamic_item(cached_fbank)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0s13XGkjNWMm"
},
"source": [
"The cache is lazy: the first access to an utterance writes its features into `speaker_feats.hdf5`. Later accesses (for instance in subsequent epochs, or from multiple workers) just read from the same file.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0_oJNpxkpgqF"
},
"source": [
"Then we set the outputs of the dataset that we want to access in the training loop:"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"id": "sahSr0yJpkhY"
},
"outputs": [],
"source": [
"dataset.set_output_keys([\"id\", \"feats\", \"spk_encoded\"])\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ovhe-jc8jWzh"
},
"source": [
"We sort the dataset by length to speed up training: batching utterances of similar length minimizes the amount of padding in each batch."
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"id": "jG36aYVvjWB7"
},
"outputs": [],
"source": [
"sorted_data = dataset.filtered_sorted(sort_key=\"length\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IPGrUCU5kNY3"
},
"source": [
"We can now train a simple classifier by passing the dataset object directly to the **Brain** class. The **Brain** class will automatically create a `SaveableDataLoader` with the specified `train_loader_kwargs` and will handle the padding for you."
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"id": "OFFiFjaek8uN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b63d6a3b-7ce0-406e-eb32-719215c702d2"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:speechbrain.core:Gradscaler enabled: `False`\n",
"INFO:speechbrain.core:Using training precision: `--precision=fp32`\n",
"INFO:speechbrain.core:Using evaluation precision: `--eval_precision=fp32`\n",
"INFO:speechbrain.core:SimpleBrain Model Statistics:\n",
"* Total Number of Trainable Parameters: 23.8k\n",
"* Total Number of Parameters: 23.8k\n",
"* Trainable Parameters represent 100.0000% of the total size.\n",
"100%|██████████| 68/68 [00:25<00:00, 2.70it/s, train_loss=4.22]\n"
]
}
],
"source": [
"from speechbrain.lobes.features import MFCC, Fbank\n",
"from speechbrain.nnet.losses import nll_loss\n",
"\n",
"\n",
"class SimpleBrain(speechbrain.Brain):\n",
"    \"\"\"Minimal speaker classifier operating on the `feats` dynamic item.\"\"\"\n",
"\n",
"    def compute_forward(self, batch, stage):\n",
"        \"\"\"Run encoder, pooling, output layer and softmax on one padded batch.\"\"\"\n",
"        feats, feat_lens = batch.feats\n",
"        hidden = self.modules.encoder(feats)\n",
"        # The relative lengths are handed to the pooling layer together\n",
"        # with the encoded frames.\n",
"        pooled = self.modules.pooling(hidden, feat_lens)\n",
"        return self.modules.softmax(self.modules.to_output(pooled))\n",
"\n",
"    def compute_objectives(self, logits, batch, stage):\n",
"        \"\"\"Return the NLL loss of `logits` against the encoded speaker labels.\"\"\"\n",
"        targets = batch.spk_encoded.data\n",
"        return nll_loss(logits, targets)\n",
"\n",
"modules = {\n",
" \"encoder\": torch.nn.Sequential(torch.nn.Linear(40, 256),\n",
" torch.nn.ReLU()),\n",
" \"pooling\": speechbrain.nnet.pooling.StatisticsPooling(),\n",
" \"to_output\": torch.nn.Linear(512, len(spk_id_encoder)),\n",
" \"softmax\": speechbrain.nnet.activations.Softmax(apply_log=True)\n",
"}\n",
"brain = SimpleBrain(modules, opt_class=lambda x: torch.optim.SGD(x, 0.01), run_opts={\"device\": \"cpu\"})\n",
"brain.fit(range(1), train_set=sorted_data,\n",
" train_loader_kwargs={\"batch_size\": 16, \"drop_last\":True})\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "H9vn3zCqNWMn"
},
"source": [
"## Authors\n",
"\n",
"- SpeechBrain team: Mirco Ravanelli, Titouan Parcollet, Peter Plantinga, and Adel Moumen (2026)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lvCQK0beq8Fc"
},
"source": [
"## Acknowledgements\n",
"\n",
"\n",
"\n",
"* Many thanks to Nasser Benabderrazik ([lenassero](https://github.com/lenassero)) who helped improve this tutorial.\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sb_auto_footer",
"tags": [
"sb_auto_footer"
]
},
"source": [
"## Citing SpeechBrain\n",
"\n",
"If you use SpeechBrain in your research or business, please cite it using the following BibTeX entry:\n",
"\n",
"```bibtex\n",
"@misc{speechbrainV1,\n",
" title={Open-Source Conversational AI with {SpeechBrain} 1.0},\n",
" author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},\n",
" year={2024},\n",
" eprint={2407.00463},\n",
" archivePrefix={arXiv},\n",
" primaryClass={cs.LG},\n",
" url={https://arxiv.org/abs/2407.00463},\n",
"}\n",
"@misc{speechbrain,\n",
" title={{SpeechBrain}: A General-Purpose Speech Toolkit},\n",
" author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},\n",
" year={2021},\n",
" eprint={2106.04624},\n",
" archivePrefix={arXiv},\n",
" primaryClass={eess.AS},\n",
" note={arXiv:2106.04624}\n",
"}\n",
"```"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}