Source code for speechbrain.lobes.models.Xvector

"""A popular speaker recognition and diarization model.

Authors
 * Nauman Dawalatabad 2020
 * Mirco Ravanelli 2020
"""

import torch  # noqa: F401
import torch.nn as nn
import speechbrain as sb
from speechbrain.nnet.pooling import StatisticsPooling
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d


class Xvector(torch.nn.Module):
    """This model extracts X-vectors for speaker recognition and diarization.

    Arguments
    ---------
    device : str
        Device used, e.g., "cpu" or "cuda".
    activation : torch class
        A class for constructing the activation layers.
    tdnn_blocks : int
        Number of time-delay neural network (TDNN) blocks.
    tdnn_channels : list of ints
        Output channels for each TDNN layer.
    tdnn_kernel_sizes : list of ints
        List of kernel sizes for each TDNN layer.
    tdnn_dilations : list of ints
        List of dilations for kernels in each TDNN layer.
    lin_neurons : int
        Number of neurons in linear layers.
    in_channels : int
        Expected number of input features per frame.

    Example
    -------
    >>> compute_xvect = Xvector('cpu')
    >>> input_feats = torch.rand([5, 10, 40])
    >>> outputs = compute_xvect(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 512])
    """

    def __init__(
        self,
        device="cpu",
        activation=torch.nn.LeakyReLU,
        tdnn_blocks=5,
        tdnn_channels=[512, 512, 512, 512, 1500],
        tdnn_kernel_sizes=[5, 3, 3, 1, 1],
        tdnn_dilations=[1, 2, 3, 1, 1],
        lin_neurons=512,
        in_channels=40,
    ):
        super().__init__()
        self.blocks = nn.ModuleList()

        # TDNN layers
        for block_index in range(tdnn_blocks):
            out_channels = tdnn_channels[block_index]
            self.blocks.extend(
                [
                    Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=tdnn_kernel_sizes[block_index],
                        dilation=tdnn_dilations[block_index],
                    ),
                    activation(),
                    BatchNorm1d(input_size=out_channels),
                ]
            )
            in_channels = tdnn_channels[block_index]

        # Statistical pooling
        self.blocks.append(StatisticsPooling())

        # Final linear transformation
        self.blocks.append(
            Linear(
                input_size=out_channels * 2,
                n_neurons=lin_neurons,
                bias=True,
                combine_dims=False,
            )
        )
    def forward(self, x, lens=None):
        """Returns the x-vectors.

        Arguments
        ---------
        x : torch.Tensor
            Input features of shape [batch, time, features].
        lens : torch.Tensor
            Relative lengths of each sequence in the batch (optional).
        """
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lens)
            except TypeError:
                x = layer(x)
        return x
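
# A minimal usage sketch: when batches are padded, passing a tensor of
# relative lengths lets StatisticsPooling ignore the padded frames.
# Shapes and lengths below are illustrative; 40 is the default `in_channels`.
#
# >>> compute_xvect = Xvector(in_channels=40)
# >>> feats = torch.rand([4, 120, 40])              # [batch, time, features]
# >>> rel_lens = torch.tensor([1.0, 0.9, 0.75, 0.5])
# >>> xvects = compute_xvect(feats, lens=rel_lens)
# >>> xvects.shape
# torch.Size([4, 1, 512])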
class Classifier(sb.nnet.containers.Sequential):
    """This class implements the MLP applied on top of the x-vector features.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> classify = Classifier(input_shape=xvects.shape)
    >>> output = classify(xvects)
    >>> output.shape
    torch.Size([5, 1, 1211])
    """

    def __init__(
        self,
        input_shape,
        activation=torch.nn.LeakyReLU,
        lin_blocks=1,
        lin_neurons=512,
        out_neurons=1211,
    ):
        super().__init__(input_shape=input_shape)

        self.append(activation(), layer_name="act")
        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")

        if lin_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="DNN")

        for block_index in range(lin_blocks):
            block_name = f"block_{block_index}"
            self.DNN.append(
                sb.nnet.containers.Sequential, layer_name=block_name
            )
            self.DNN[block_name].append(
                sb.nnet.linear.Linear,
                n_neurons=lin_neurons,
                bias=True,
                layer_name="linear",
            )
            self.DNN[block_name].append(activation(), layer_name="act")
            self.DNN[block_name].append(
                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
            )

        # Final Softmax classifier
        self.append(
            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
        )
        self.append(
            sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
        )
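
# A minimal training sketch: the classifier returns log-probabilities
# (Softmax with apply_log=True), so a negative log-likelihood loss applies
# directly. The class count (1211) matches the default `out_neurons`;
# the target indices below are illustrative.
#
# >>> import torch.nn.functional as F
# >>> xvects = Xvector()(torch.rand([5, 10, 40]))
# >>> classify = Classifier(input_shape=xvects.shape)
# >>> log_probs = classify(xvects)                   # [5, 1, 1211]
# >>> targets = torch.randint(0, 1211, (5,))
# >>> loss = F.nll_loss(log_probs.squeeze(1), targets)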
class Discriminator(sb.nnet.containers.Sequential):
    """This class implements a discriminator on top of the x-vector features.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> discriminate = Discriminator(xvects.shape)
    >>> output = discriminate(xvects)
    >>> output.shape
    torch.Size([5, 1, 1])
    """

    def __init__(
        self,
        input_shape,
        activation=torch.nn.LeakyReLU,
        lin_blocks=1,
        lin_neurons=512,
        out_neurons=1,
    ):
        super().__init__(input_shape=input_shape)

        if lin_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="DNN")

        for block_index in range(lin_blocks):
            block_name = f"block_{block_index}"
            self.DNN.append(
                sb.nnet.containers.Sequential, layer_name=block_name
            )
            self.DNN[block_name].append(
                sb.nnet.linear.Linear,
                n_neurons=lin_neurons,
                bias=True,
                combine_dims=False,
                layer_name="linear",
            )
            self.DNN[block_name].append(
                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
            )
            self.DNN[block_name].append(activation(), layer_name="act")

        # Final layer (sigmoid not included)
        self.append(
            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
        )
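
# A minimal sketch of the discriminator in use, e.g., for adversarial training
# on x-vectors: the final layer returns a raw score per example (no sigmoid),
# so a logit-based binary cross-entropy is the natural pairing. The labels
# below are illustrative.
#
# >>> xvects = Xvector()(torch.rand([5, 10, 40]))
# >>> discriminate = Discriminator(xvects.shape)
# >>> scores = discriminate(xvects)                  # [5, 1, 1]
# >>> labels = torch.ones(5, 1, 1)                   # e.g., "real" targets
# >>> loss = torch.nn.BCEWithLogitsLoss()(scores, labels)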