"""A popular speaker recognition and diarization model.
Authors
* Nauman Dawalatabad 2020
* Mirco Ravanelli 2020
"""
import torch # noqa: F401
import torch.nn as nn
import speechbrain as sb
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d
from speechbrain.nnet.pooling import StatisticsPooling
class Xvector(torch.nn.Module):
"""This model extracts X-vectors for speaker recognition and diarization.
Arguments
---------
device : str
Device used e.g. "cpu" or "cuda".
activation : torch class
A class for constructing the activation layers.
tdnn_blocks : int
Number of time-delay neural (TDNN) layers.
tdnn_channels : list of ints
Output channels for TDNN layer.
tdnn_kernel_sizes : list of ints
List of kernel sizes for each TDNN layer.
tdnn_dilations : list of ints
List of dilations for kernels in each TDNN layer.
lin_neurons : int
Number of neurons in linear layers.
in_channels : int
Expected size of input features.
Example
-------
>>> compute_xvect = Xvector('cpu')
>>> input_feats = torch.rand([5, 10, 40])
>>> outputs = compute_xvect(input_feats)
>>> outputs.shape
torch.Size([5, 1, 512])
"""
def __init__(
self,
device="cpu",
activation=torch.nn.LeakyReLU,
tdnn_blocks=5,
tdnn_channels=[512, 512, 512, 512, 1500],
tdnn_kernel_sizes=[5, 3, 3, 1, 1],
tdnn_dilations=[1, 2, 3, 1, 1],
lin_neurons=512,
in_channels=40,
):
super().__init__()
self.blocks = nn.ModuleList()
# TDNN layers
for block_index in range(tdnn_blocks):
out_channels = tdnn_channels[block_index]
self.blocks.extend(
[
Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=tdnn_kernel_sizes[block_index],
dilation=tdnn_dilations[block_index],
),
activation(),
BatchNorm1d(input_size=out_channels),
]
)
in_channels = tdnn_channels[block_index]
# Statistical pooling
self.blocks.append(StatisticsPooling())
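        # StatisticsPooling concatenates the per-channel mean and standard
        # deviation over time, doubling the feature dimension; this is why the
        # final Linear below takes out_channels * 2 inputs.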
# Final linear transformation
self.blocks.append(
Linear(
input_size=out_channels * 2,
n_neurons=lin_neurons,
bias=True,
combine_dims=False,
)
)
def forward(self, x, lens=None):
"""Returns the x-vectors.
Arguments
---------
x : torch.Tensor
Inputs features for extracting x-vectors.
lens : torch.Tensor
The corresponding relative lengths of the inputs.
Returns
-------
x : torch.Tensor
X-vectors.
"""
for layer in self.blocks:
try:
x = layer(x, lengths=lens)
except TypeError:
x = layer(x)
return x
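
# A minimal usage sketch (not part of the original recipe; the batch size,
# frame count, relative lengths, and helper name below are illustrative
# assumptions). It shows how padded batches can be handled by passing the
# relative lengths to the model.
def _demo_extract_xvectors():
    feats = torch.rand([4, 120, 40])  # [batch, time, features]
    lens = torch.tensor([1.0, 0.9, 0.75, 0.5])  # relative lengths in (0, 1]
    model = Xvector(in_channels=40, lin_neurons=512)
    xvects = model(feats, lens=lens)  # -> torch.Size([4, 1, 512])
    return xvects
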
class Classifier(sb.nnet.containers.Sequential):
"""This class implements the last MLP on the top of xvector features.
Arguments
---------
input_shape : tuple
Expected shape of an example input.
activation : torch class
A class for constructing the activation layers.
lin_blocks : int
Number of linear layers.
lin_neurons : int
Number of neurons in linear layers.
out_neurons : int
Number of output neurons.
Example
-------
>>> input_feats = torch.rand([5, 10, 40])
>>> compute_xvect = Xvector()
>>> xvects = compute_xvect(input_feats)
>>> classify = Classifier(input_shape=xvects.shape)
>>> output = classify(xvects)
>>> output.shape
torch.Size([5, 1, 1211])
"""
def __init__(
self,
input_shape,
activation=torch.nn.LeakyReLU,
lin_blocks=1,
lin_neurons=512,
out_neurons=1211,
):
super().__init__(input_shape=input_shape)
self.append(activation(), layer_name="act")
self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")
if lin_blocks > 0:
self.append(sb.nnet.containers.Sequential, layer_name="DNN")
for block_index in range(lin_blocks):
block_name = f"block_{block_index}"
self.DNN.append(
sb.nnet.containers.Sequential, layer_name=block_name
)
self.DNN[block_name].append(
sb.nnet.linear.Linear,
n_neurons=lin_neurons,
bias=True,
layer_name="linear",
)
self.DNN[block_name].append(activation(), layer_name="act")
self.DNN[block_name].append(
sb.nnet.normalization.BatchNorm1d, layer_name="norm"
)
# Final Softmax classifier
self.append(
sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
)
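        # apply_log=True makes this a log-softmax, so the classifier outputs
        # log-posteriors over speakers (typically paired with a negative
        # log-likelihood loss).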
self.append(
sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
)
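
# A minimal classification sketch (illustrative assumptions: batch size,
# speaker count, and the helper name are not from the original module). The
# Classifier emits log-posteriors, so it is paired here with a negative
# log-likelihood loss.
def _demo_classify_speakers():
    feats = torch.rand([8, 200, 40])
    xvect_model = Xvector()
    xvects = xvect_model(feats)  # [8, 1, 512]
    classifier = Classifier(input_shape=xvects.shape, out_neurons=1211)
    log_probs = classifier(xvects)  # [8, 1, 1211]
    targets = torch.randint(0, 1211, (8,))
    return nn.functional.nll_loss(log_probs.squeeze(1), targets)
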
class Discriminator(sb.nnet.containers.Sequential):
"""This class implements a discriminator on the top of xvector features.
Arguments
---------
input_shape : tuple
Expected shape of the input tensor.
activation : torch class
A class for constructing the activation layers.
lin_blocks : int
Number of linear layers.
lin_neurons : int
Number of neurons in linear layers.
out_neurons : int
Size of the output vector.
Example
-------
>>> input_feats = torch.rand([5, 10, 40])
>>> compute_xvect = Xvector()
>>> xvects = compute_xvect(input_feats)
>>> discriminate = Discriminator(xvects.shape)
>>> output = discriminate(xvects)
>>> output.shape
torch.Size([5, 1, 1])
"""
def __init__(
self,
input_shape,
activation=torch.nn.LeakyReLU,
lin_blocks=1,
lin_neurons=512,
out_neurons=1,
):
super().__init__(input_shape=input_shape)
if lin_blocks > 0:
self.append(sb.nnet.containers.Sequential, layer_name="DNN")
for block_index in range(lin_blocks):
block_name = f"block_{block_index}"
self.DNN.append(
sb.nnet.containers.Sequential, layer_name=block_name
)
self.DNN[block_name].append(
sb.nnet.linear.Linear,
n_neurons=lin_neurons,
bias=True,
combine_dims=False,
layer_name="linear",
)
self.DNN[block_name].append(
sb.nnet.normalization.BatchNorm1d, layer_name="norm"
)
self.DNN[block_name].append(activation(), layer_name="act")
# Final Layer (sigmoid not included)
self.append(
sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
)
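
# A minimal sketch of how the Discriminator might be used (an assumption, not
# the original training recipe): since the final Linear omits the sigmoid, a
# logits-based binary cross-entropy is a natural pairing. Batch sizes, labels,
# and the helper name are illustrative only.
def _demo_discriminate():
    real_xvects = torch.rand([8, 1, 512])
    fake_xvects = torch.rand([8, 1, 512])
    disc = Discriminator(input_shape=real_xvects.shape)
    logits = torch.cat([disc(real_xvects), disc(fake_xvects)], dim=0)  # [16, 1, 1]
    labels = torch.cat([torch.ones(8, 1, 1), torch.zeros(8, 1, 1)], dim=0)
    return nn.functional.binary_cross_entropy_with_logits(logits, labels)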