# Source code for speechbrain.nnet.normalization

"""Library implementing normalization.

Authors
 * Mirco Ravanelli 2020
 * Guillermo Cámbara 2021
 * Sarthak Yadav 2022
"""
import torch
import torch.nn as nn


class BatchNorm1d(nn.Module):
    """Applies 1d batch normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    affine : bool
        When set to True, the affine parameters are learned.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    combine_batch_time : bool
        When true, it combines batch an time axis.
    skip_transpose : bool
        If False, uses batch x time x channel convention of speechbrain.
        If True, the input is used as-is (batch x channel x time convention),
        and the channel count is read from ``input_shape[1]``.

    Example
    -------
    >>> input = torch.randn(100, 10)
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 10])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        combine_batch_time=False,
        skip_transpose=False,
    ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        # Fail early with a clear message (consistent with the other
        # normalization classes in this module) instead of raising a
        # confusing TypeError when subscripting ``None`` below.
        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")

        if input_size is None and skip_transpose:
            # channel-first convention: channels live on axis 1
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        self.norm = nn.BatchNorm1d(
            input_size,
            eps=eps,
            momentum=momentum,
            affine=affine,
            track_running_stats=track_running_stats,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, [channels])
            input to normalize. 2d or 3d tensors are expected in input
            4d tensors can be used when combine_dims=True.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            # Fold time into the batch axis so nn.BatchNorm1d sees a
            # (batch*time, channels[, extra]) tensor.
            if x.ndim == 3:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
            else:
                x = x.reshape(
                    shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
                )
        elif not self.skip_transpose:
            # (batch, time, channels) -> (batch, channels, time)
            x = x.transpose(-1, 1)

        x_n = self.norm(x)

        # Undo the reshape/transpose so callers get the input layout back.
        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose(1, -1)

        return x_n
class BatchNorm2d(nn.Module):
    """Applies 2d batch normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    affine : bool
        When set to True, the affine parameters are learned.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.

    Example
    -------
    >>> input = torch.randn(100, 10, 5, 20)
    >>> norm = BatchNorm2d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 10, 5, 20])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
    ):
        super().__init__()

        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")

        # Infer the channel count from the trailing dimension when only
        # the full shape was provided.
        num_features = input_shape[-1] if input_size is None else input_size

        self.norm = nn.BatchNorm2d(
            num_features,
            eps=eps,
            momentum=momentum,
            affine=affine,
            track_running_stats=track_running_stats,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channel1, channel2)
            input to normalize. 4d tensors are expected.
        """
        # Move the channel axis next to batch, normalize, then restore.
        transposed = x.transpose(-1, 1)
        normalized = self.norm(transposed)
        return normalized.transpose(1, -1)
class LayerNorm(nn.Module):
    """Applies layer normalization to the input tensor.

    Arguments
    ---------
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    input_shape : tuple
        The expected shape of the input; when given, it takes precedence
        and normalization is applied over every axis after batch and time.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    elementwise_affine : bool
        If True, this module has learnable per-element affine parameters
        initialized to ones (for weights) and zeros (for biases).

    Example
    -------
    >>> input = torch.randn(100, 101, 128)
    >>> norm = LayerNorm(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 101, 128])
    """

    def __init__(
        self,
        input_size=None,
        input_shape=None,
        eps=1e-05,
        elementwise_affine=True,
    ):
        super().__init__()
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        # A full shape wins over input_size: strip batch and time axes,
        # normalize over everything that remains.
        normalized_shape = (
            input_size if input_shape is None else input_shape[2:]
        )

        self.norm = torch.nn.LayerNorm(
            normalized_shape,
            eps=self.eps,
            elementwise_affine=self.elementwise_affine,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channels)
            input to normalize. 3d or 4d tensors are expected.
        """
        # LayerNorm already operates on trailing dims; no transpose needed.
        return self.norm(x)
class InstanceNorm1d(nn.Module):
    """Applies 1d instance normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    affine : bool
        A boolean value that when set to True, this module has learnable
        affine parameters, initialized the same way as done for
        batch normalization. Default: False.

    Example
    -------
    >>> input = torch.randn(100, 10, 20)
    >>> norm = InstanceNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 10, 20])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        track_running_stats=True,
        affine=False,
    ):
        super().__init__()

        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")

        # Channel count from the trailing dim when only a shape was given.
        num_features = input_shape[-1] if input_size is None else input_size

        self.norm = nn.InstanceNorm1d(
            num_features,
            eps=eps,
            momentum=momentum,
            track_running_stats=track_running_stats,
            affine=affine,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channels)
            input to normalize. 3d tensors are expected.
        """
        # (batch, time, channels) -> (batch, channels, time) for the torch op.
        transposed = x.transpose(-1, 1)
        normalized = self.norm(transposed)
        return normalized.transpose(1, -1)
class InstanceNorm2d(nn.Module):
    """Applies 2d instance normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    affine : bool
        A boolean value that when set to True, this module has learnable
        affine parameters, initialized the same way as done for
        batch normalization. Default: False.

    Example
    -------
    >>> input = torch.randn(100, 10, 20, 2)
    >>> norm = InstanceNorm2d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 10, 20, 2])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        track_running_stats=True,
        affine=False,
    ):
        super().__init__()

        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")

        # Channel count from the trailing dim when only a shape was given.
        num_features = input_shape[-1] if input_size is None else input_size

        self.norm = nn.InstanceNorm2d(
            num_features,
            eps=eps,
            momentum=momentum,
            track_running_stats=track_running_stats,
            affine=affine,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channel1, channel2)
            input to normalize. 4d tensors are expected.
        """
        # Swap the channel axis into position 1, normalize, swap back.
        transposed = x.transpose(-1, 1)
        normalized = self.norm(transposed)
        return normalized.transpose(1, -1)
class GroupNorm(nn.Module):
    """Applies group normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input; when given, it takes precedence
        over ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    num_groups : int
        Number of groups to separate the channels into.
    eps : float
        This value is added to std deviation estimation to improve the
        numerical stability.
    affine : bool
        A boolean value that when set to True, this module has learnable
        per-channel affine parameters initialized to ones (for weights)
        and zeros (for biases).

    Example
    -------
    >>> input = torch.randn(100, 101, 128)
    >>> norm = GroupNorm(input_size=128, num_groups=128)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 101, 128])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        num_groups=None,
        eps=1e-05,
        affine=True,
    ):
        super().__init__()
        self.eps = eps
        self.affine = affine

        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")
        if num_groups is None:
            raise ValueError("Expected num_groups as input")

        # Unlike the batch-norm wrappers above, a full shape takes
        # precedence over input_size here.
        if input_shape is not None:
            input_size = input_shape[-1]

        self.norm = torch.nn.GroupNorm(
            num_groups,
            input_size,
            eps=self.eps,
            affine=self.affine,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channels)
            input to normalize. 3d or 4d tensors are expected.
        """
        # Channels to axis 1 for torch.nn.GroupNorm, then back.
        transposed = x.transpose(-1, 1)
        normalized = self.norm(transposed)
        return normalized.transpose(1, -1)
class ExponentialMovingAverage(nn.Module):
    """Applies learnable exponential moving average, as required by
    learnable PCEN layer.

    Arguments
    ---------
    input_size : int
        The expected size of the input.
    coeff_init: float
        Initial smoothing coefficient value
    per_channel: bool
        Controls whether every smoothing coefficients are learned
        independently for every input channel
    trainable: bool
        whether to learn the PCEN parameters or use fixed
    skip_transpose : bool
        If False, uses batch x time x channel convention of speechbrain.
        If True, uses batch x channel x time convention.

    Example
    -------
    >>> inp_tensor = torch.rand([10, 50, 40])
    >>> pcen = ExponentialMovingAverage(40)
    >>> out_tensor = pcen(inp_tensor)
    >>> out_tensor.shape
    torch.Size([10, 50, 40])
    """

    def __init__(
        self,
        input_size: int,
        coeff_init: float = 0.04,
        per_channel: bool = False,
        trainable: bool = True,
        skip_transpose: bool = False,
    ):
        super().__init__()
        self._coeff_init = coeff_init
        self._per_channel = per_channel
        self.skip_transpose = skip_transpose
        self.trainable = trainable

        # One smoothing coefficient per channel, or a single shared scalar.
        n_coeffs = input_size if per_channel else 1
        self._weights = nn.Parameter(
            torch.ones(n_coeffs) * coeff_init, requires_grad=trainable
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channels)
            input to normalize.
        """
        if not self.skip_transpose:
            # -> (batch, channels, time) so the recurrence runs over time.
            x = x.transpose(1, -1)

        # Keep the learned coefficient inside the stable [0, 1] range.
        smooth = torch.clamp(self._weights, min=0.0, max=1.0)

        # Seed the filter with the first frame, then apply the standard
        # first-order recurrence s_t = w * x_t + (1 - w) * s_{t-1}.
        state = x[:, :, 0]
        frames = []
        for frame in torch.unbind(x, dim=-1):
            state = smooth * frame + (1.0 - smooth) * state
            frames.append(state)
        output = torch.stack(frames, dim=-1)

        if not self.skip_transpose:
            output = output.transpose(1, -1)
        return output
class PCEN(nn.Module):
    """Learnable Per-channel energy normalization (PCEN) layer.

    Supports both the original PCEN as specified in [1] and sPCEN as
    specified in [2]. The default argument values correspond with those
    used by [2].

    [1] Yuxuan Wang, Pascal Getreuer, Thad Hughes, Richard F. Lyon,
    Rif A. Saurous, "Trainable Frontend For Robust and Far-Field Keyword
    Spotting", in Proc of ICASSP 2017 (https://arxiv.org/abs/1607.05666)

    [2] Neil Zeghidour, Olivier Teboul, F{\'e}lix de Chaumont Quitry &
    Marco Tagliasacchi, "LEAF: A LEARNABLE FRONTEND FOR AUDIO
    CLASSIFICATION", in Proc of ICLR 2021 (https://arxiv.org/abs/2101.08596)

    Arguments
    ---------
    input_size : int
        The expected size of the input.
    alpha: float
        specifies alpha coefficient for PCEN
    smooth_coef: float
        specified smooth coefficient for PCEN
    delta: float
        specifies delta coefficient for PCEN
    root: float
        specifies root coefficient for PCEN
    floor: float
        specifies floor coefficient for PCEN
    trainable: bool
        whether to learn the PCEN parameters or use fixed
    per_channel_smooth_coef: bool
        whether to learn independent smooth coefficients for every channel.
        when True, essentially using sPCEN from [2]
    skip_transpose : bool
        If False, uses batch x time x channel convention of speechbrain.
        If True, uses batch x channel x time convention.

    Example
    -------
    >>> inp_tensor = torch.rand([10, 50, 40])
    >>> pcen = PCEN(40, alpha=0.96)         # sPCEN
    >>> out_tensor = pcen(inp_tensor)
    >>> out_tensor.shape
    torch.Size([10, 50, 40])
    """

    def __init__(
        self,
        input_size,
        alpha: float = 0.96,
        smooth_coef: float = 0.04,
        delta: float = 2.0,
        root: float = 2.0,
        floor: float = 1e-12,
        trainable: bool = True,
        per_channel_smooth_coef: bool = True,
        skip_transpose: bool = False,
    ):
        super().__init__()
        self._smooth_coef = smooth_coef
        self._floor = floor
        self._per_channel_smooth_coef = per_channel_smooth_coef
        self.skip_transpose = skip_transpose

        # One learnable coefficient per input channel for each of
        # alpha, delta, and root.
        self.alpha = nn.Parameter(
            torch.ones(input_size) * alpha, requires_grad=trainable
        )
        self.delta = nn.Parameter(
            torch.ones(input_size) * delta, requires_grad=trainable
        )
        self.root = nn.Parameter(
            torch.ones(input_size) * root, requires_grad=trainable
        )

        # The EMA submodule works in (batch, channel, time) layout, so it
        # skips its own transpose; this module handles layout itself.
        self.ema = ExponentialMovingAverage(
            input_size,
            coeff_init=smooth_coef,
            per_channel=per_channel_smooth_coef,
            skip_transpose=True,
            trainable=trainable,
        )

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channels)
            input to normalize.
        """
        if not self.skip_transpose:
            # -> (batch, channels, time)
            x = x.transpose(1, -1)

        one = torch.tensor(1.0, dtype=x.dtype, device=x.device)
        # Clamp learned coefficients into their valid ranges.
        alpha = torch.min(self.alpha, one)
        root = torch.max(self.root, one)

        ema_smoother = self.ema(x)
        one_over_root = 1.0 / root

        # Broadcast the per-channel parameters over (batch, channel, time).
        alpha_b = alpha.view(1, -1, 1)
        delta_b = self.delta.view(1, -1, 1)
        inv_root_b = one_over_root.view(1, -1, 1)

        gain = (self._floor + ema_smoother) ** alpha_b
        output = (x / gain + delta_b) ** inv_root_b - delta_b ** inv_root_b

        if not self.skip_transpose:
            output = output.transpose(1, -1)
        return output