# Source code for speechbrain.lobes.models.CRDNN

"""A combination of Convolutional, Recurrent, and Fully-connected networks.

Authors
 * Mirco Ravanelli 2020
 * Peter Plantinga 2020
 * Ju-Chieh Chou 2020
 * Titouan Parcollet 2020
 * Abdel 2020
"""
import torch
import speechbrain as sb

class CRDNN(sb.nnet.containers.Sequential):
    """This model is a combination of CNNs, RNNs, and DNNs.

    This model expects 3-dimensional input [batch, time, feats] and
    by default produces output of the size [batch, time, dnn_neurons].

    One exception is if ``using_2d_pooling`` or ``time_pooling`` is True.
    In this case, the time dimension will be downsampled.

    Arguments
    ---------
    input_size : int
        The length of the expected input at the third dimension.
    input_shape : tuple
        While input_size will suffice, this option can allow putting
        CRDNN into a sequential with other classes.
    activation : torch class
        A class used for constructing the activation layers for CNN and DNN.
    dropout : float
        Neuron dropout rate as applied to CNN, RNN, and DNN.
    cnn_blocks : int
        The number of convolutional neural blocks to include.
    cnn_channels : list of ints
        A list of the number of output channels for each CNN block.
        It is indexed by block number, so it needs at least
        ``cnn_blocks`` entries.
    cnn_kernelsize : tuple of ints
        The size of the convolutional kernels.
    time_pooling : bool
        Whether to pool the utterance on the time axis before the RNN.
    time_pooling_size : int
        The number of elements to pool on the time axis. Only the kernel
        size is passed to the pooling layer; no separate stride is given.
    freq_pooling_size : int
        Accepted for interface compatibility; this constructor does not
        use it. The per-block pooling size is taken from
        ``inter_layer_pooling_size`` instead.
    rnn_class : torch class
        The type of RNN to use in CRDNN network (LiGRU, LSTM, GRU, RNN)
    inter_layer_pooling_size : list of ints
        A list of the pooling sizes for each CNN block. It is indexed by
        block number, so it needs at least ``cnn_blocks`` entries.
    using_2d_pooling : bool
        Whether using a 2D or 1D pooling after each CNN block.
    rnn_layers : int
        The number of recurrent RNN layers to include.
    rnn_neurons : int
        Number of neurons in each layer of the RNN.
    rnn_bidirectional : bool
        Whether this model will process just forward or in both directions.
    rnn_re_init : bool
        If True, an orthogonal initialization will be applied to the
        recurrent weights.
    dnn_blocks : int
        The number of linear neural blocks to include.
    dnn_neurons : int
        The number of neurons in the linear layers.
    projection_dim : int
        The number of neurons in the projection layer.
        This layer is used to reduce the size of the flattened
        representation obtained after the CNN blocks. The default of -1
        disables the projection.
    use_rnnp : bool
        If True, a linear projection layer is added between RNN layers:
        each RNN layer is built as a separate single-layer RNN and is
        followed by a linear layer of ``dnn_neurons`` units and dropout.

    Raises
    ------
    ValueError
        If neither ``input_size`` nor ``input_shape`` is given.

    Example
    -------
    >>> inputs = torch.rand([10, 15, 60])
    >>> model = CRDNN(input_shape=inputs.shape)
    >>> outputs = model(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 512])
    """

    def __init__(
        self,
        input_size=None,
        input_shape=None,
        activation=torch.nn.LeakyReLU,
        dropout=0.15,
        cnn_blocks=2,
        cnn_channels=[128, 256],
        cnn_kernelsize=(3, 3),
        time_pooling=False,
        time_pooling_size=2,
        freq_pooling_size=2,
        rnn_class=sb.nnet.RNN.LiGRU,
        inter_layer_pooling_size=[2, 2],
        using_2d_pooling=False,
        rnn_layers=4,
        rnn_neurons=512,
        rnn_bidirectional=True,
        rnn_re_init=False,
        dnn_blocks=2,
        dnn_neurons=512,
        projection_dim=-1,
        use_rnnp=False,
    ):
        if input_size is None and input_shape is None:
            raise ValueError("Must specify one of input_size or input_shape")

        # Only the feature dimension is known when just input_size is given.
        if input_shape is None:
            input_shape = [None, None, input_size]
        super().__init__(input_shape=input_shape)

        # --- Convolutional front-end -------------------------------------
        if cnn_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="CNN")
            for block_index in range(cnn_blocks):
                self.CNN.append(
                    CNN_Block,
                    channels=cnn_channels[block_index],
                    kernel_size=cnn_kernelsize,
                    using_2d_pool=using_2d_pooling,
                    pooling_size=inter_layer_pooling_size[block_index],
                    activation=activation,
                    dropout=dropout,
                    layer_name=f"block_{block_index}",
                )

        # --- Optional max-pooling along the time axis (axis 1) -----------
        if time_pooling:
            self.append(
                sb.nnet.pooling.Pooling1d(
                    pool_type="max",
                    input_dims=4,
                    kernel_size=time_pooling_size,
                    pool_axis=1,
                ),
                layer_name="time_pooling",
            )

        # This projection helps reducing the number of parameters
        # when using large number of CNN filters.
        # Large numbers of CNN filters + large features
        # often lead to very large flattened layers.
        # This layer projects it back to something reasonable.
        if projection_dim != -1:
            self.append(sb.nnet.containers.Sequential, layer_name="projection")
            self.projection.append(
                sb.nnet.linear.Linear,
                n_neurons=projection_dim,
                bias=True,
                combine_dims=True,
                layer_name="linear",
            )
            self.projection.append(
                sb.nnet.normalization.LayerNorm, layer_name="norm"
            )
            self.projection.append(activation(), layer_name="act")

        # --- Recurrent layers --------------------------------------------
        if rnn_layers > 0:
            if use_rnnp:
                # NOTE(review): the per-layer modules below are appended to
                # this model itself, not to the ``RNN`` container created
                # here, which therefore stays empty. Left unchanged because
                # moving them would presumably rename the sub-modules that
                # saved checkpoints refer to -- confirm before changing.
                self.append(sb.nnet.containers.Sequential, layer_name="RNN")
                for _ in range(rnn_layers):
                    self.append(
                        rnn_class,
                        hidden_size=rnn_neurons,
                        num_layers=1,
                        bidirectional=rnn_bidirectional,
                        re_init=rnn_re_init,
                    )
                    self.append(
                        sb.nnet.linear.Linear,
                        n_neurons=dnn_neurons,
                        bias=True,
                        combine_dims=True,
                    )
                    self.append(torch.nn.Dropout(p=dropout))
            else:
                self.append(
                    rnn_class,
                    layer_name="RNN",
                    hidden_size=rnn_neurons,
                    num_layers=rnn_layers,
                    dropout=dropout,
                    bidirectional=rnn_bidirectional,
                    re_init=rnn_re_init,
                )

        # --- Fully-connected back-end ------------------------------------
        if dnn_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="DNN")
            for block_index in range(dnn_blocks):
                self.DNN.append(
                    DNN_Block,
                    neurons=dnn_neurons,
                    activation=activation,
                    dropout=dropout,
                    layer_name=f"block_{block_index}",
                )
class CNN_Block(sb.nnet.containers.Sequential):
    """CNN Block, based on VGG blocks.

    The block stacks two convolution / layer-norm / activation stages,
    followed by a max-pooling layer and channel dropout.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of the input.
    channels : int
        Number of convolutional channels for the block.
    kernel_size : tuple
        Size of the 2d convolutional kernel
    activation : torch.nn.Module class
        A class to be used for instantiating an activation layer.
    using_2d_pool : bool
        Whether to use 2d pooling or only 1d pooling.
    pooling_size : int
        Size of pooling kernel, duplicated for 2d pooling.
    dropout : float
        Rate to use for dropping channels.

    Example
    -------
    >>> inputs = torch.rand(10, 15, 60)
    >>> block = CNN_Block(input_shape=inputs.shape, channels=32)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 30, 32])
    """

    def __init__(
        self,
        input_shape,
        channels,
        kernel_size=[3, 3],
        activation=torch.nn.LeakyReLU,
        using_2d_pool=False,
        pooling_size=2,
        dropout=0.15,
    ):
        super().__init__(input_shape=input_shape)

        # Two identical stages, named conv_1/norm_1/act_1 and
        # conv_2/norm_2/act_2.
        for stage in range(1, 3):
            self.append(
                sb.nnet.CNN.Conv2d,
                out_channels=channels,
                kernel_size=kernel_size,
                layer_name=f"conv_{stage}",
            )
            self.append(
                sb.nnet.normalization.LayerNorm, layer_name=f"norm_{stage}"
            )
            self.append(activation(), layer_name=f"act_{stage}")

        # Pool over axes (1, 2) in the 2d case, over axis 2 only otherwise.
        if using_2d_pool:
            pool_layer = sb.nnet.pooling.Pooling2d(
                pool_type="max",
                kernel_size=(pooling_size, pooling_size),
                pool_axis=(1, 2),
            )
        else:
            pool_layer = sb.nnet.pooling.Pooling1d(
                pool_type="max",
                input_dims=4,
                kernel_size=pooling_size,
                pool_axis=2,
            )
        self.append(pool_layer, layer_name="pooling")

        drop_layer = sb.nnet.dropout.Dropout2d(drop_rate=dropout)
        self.append(drop_layer, layer_name="drop")
class DNN_Block(sb.nnet.containers.Sequential):
    """Block for linear layers.

    The block chains a linear layer, batch normalization, an activation
    and dropout, in that order.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of the input.
    neurons : int
        Size of the linear layers.
    activation : torch.nn.Module class
        Class definition to use for constructing activation layers.
    dropout : float
        Rate to use for dropping neurons.

    Example
    -------
    >>> inputs = torch.rand(10, 15, 128)
    >>> block = DNN_Block(input_shape=inputs.shape, neurons=64)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 64])
    """

    def __init__(
        self, input_shape, neurons, activation=torch.nn.LeakyReLU, dropout=0.15
    ):
        super().__init__(input_shape=input_shape)

        # linear -> norm -> act -> dropout
        self.append(
            sb.nnet.linear.Linear, n_neurons=neurons, layer_name="linear"
        )
        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")

        nonlinearity = activation()
        self.append(nonlinearity, layer_name="act")

        drop_layer = torch.nn.Dropout(p=dropout)
        self.append(drop_layer, layer_name="dropout")