Source code for speechbrain.lobes.models.segan_model

"""
This file contains two PyTorch modules which together make up the SEGAN model architecture
(based on the paper: Pascual et al., https://arxiv.org/pdf/1703.09452.pdf).
Modifying the initialization parameters allows changing the model as described in the class project,
such as turning the generator into a VAE or removing the latent variable concatenation.

Loss functions for training SEGAN are also defined in this file.

Authors
 * Francis Carter 2021
"""

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from math import floor


class Generator(torch.nn.Module):
    """CNN autoencoder model to clean speech signals.

    Arguments
    ---------
    kernel_size : int
        Size of the convolutional kernel.
    latent_vae : bool
        Whether or not to convert the autoencoder to a VAE.
    z_prob : bool
        Whether to concatenate a random latent variable z to the encoder
        output (True) or a zero tensor (False). Only applicable if
        latent_vae is False.
    """

    def __init__(self, kernel_size, latent_vae, z_prob):
        super().__init__()
        self.EncodeLayers = torch.nn.ModuleList()
        self.DecodeLayers = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        self.latent_vae = latent_vae
        self.z_prob = z_prob
        EncoderChannels = [1, 16, 32, 32, 64, 64, 128, 128, 256, 256, 512, 1024]
        # DecoderChannels is only used for its length; the decoder channel
        # counts themselves are derived from EncoderChannels below.
        DecoderChannels = [2048, 1024, 512, 512, 256, 256, 128, 128, 64, 64, 32, 1]

        # Create the encoder layers (stride-2 convolutions).
        for i in range(len(EncoderChannels) - 1):
            if i == len(EncoderChannels) - 2 and self.latent_vae:
                # The last encoder layer outputs both the mean and the
                # log-variance of the latent distribution.
                outs = EncoderChannels[i + 1] * 2
            else:
                outs = EncoderChannels[i + 1]
            self.EncodeLayers.append(
                nn.Conv1d(
                    in_channels=EncoderChannels[i],
                    out_channels=outs,
                    kernel_size=kernel_size,
                    stride=2,
                    padding=floor(kernel_size / 2),  # same
                )
            )

        # Create the decoder layers (stride-2 transposed convolutions).
        for i in range(len(DecoderChannels) - 1):
            if i == 0 and self.latent_vae:
                # The sampled VAE latent is used directly (no concatenation
                # with z), so the channel count is not doubled.
                ins = EncoderChannels[-1 * (i + 1)]
            else:
                # Doubled by the concatenation with z (first layer) or with
                # the skip connection (subsequent layers).
                ins = EncoderChannels[-1 * (i + 1)] * 2
            self.DecodeLayers.append(
                nn.ConvTranspose1d(
                    in_channels=ins,
                    out_channels=EncoderChannels[-1 * (i + 2)],
                    # Adding one to the kernel size makes the dimensions match.
                    kernel_size=kernel_size + 1,
                    stride=2,
                    padding=floor(kernel_size / 2),  # same
                )
            )
    def forward(self, x):
        """Forward pass through the autoencoder."""
        # Encode
        skips = []
        x = x.permute(0, 2, 1)
        for i, layer in enumerate(self.EncodeLayers):
            x = layer(x)
            skips.append(x.clone())
            if i == len(self.EncodeLayers) - 1:
                continue
            else:
                x = F.leaky_relu(x, negative_slope=0.3)

        # Fuse z
        if self.latent_vae:
            # Sample from the latent probability distribution
            # (reparameterisation trick).
            z_mean, z_logvar = x.chunk(2, dim=1)
            x = z_mean + torch.exp(z_logvar / 2.0) * torch.randn_like(
                z_logvar, device=x.device
            )
        elif self.z_prob:
            z = torch.normal(torch.zeros_like(x), torch.ones_like(x))
            x = torch.cat((x, z), 1)
        else:
            z = torch.zeros_like(x)
            x = torch.cat((x, z), 1)

        # Decode
        for i, layer in enumerate(self.DecodeLayers):
            x = layer(x)
            if i == len(self.DecodeLayers) - 1:
                continue
            else:
                x = torch.cat((x, skips[-1 * (i + 2)]), 1)
                x = F.leaky_relu(x, negative_slope=0.3)
        x = x.permute(0, 2, 1)
        if self.latent_vae:
            return x, z_mean, z_logvar
        else:
            return x
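
# A minimal usage sketch of the Generator. The kernel size of 31 and the
# 16384-sample window are illustrative assumptions, not values fixed by this
# module; with eleven stride-2 encoder layers the input length should be a
# multiple of 2 ** 11 for the output to match the input shape.
#
#     generator = Generator(kernel_size=31, latent_vae=False, z_prob=True)
#     noisy = torch.randn(4, 16384, 1)  # (batch, time, channels)
#     enhanced = generator(noisy)  # (4, 16384, 1)
#
#     vae_generator = Generator(kernel_size=31, latent_vae=True, z_prob=False)
#     enhanced, z_mean, z_logvar = vae_generator(noisy)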
class Discriminator(torch.nn.Module):
    """CNN discriminator of SEGAN.

    Arguments
    ---------
    kernel_size : int
        Size of the convolutional kernel.
    """

    def __init__(self, kernel_size):
        super().__init__()
        self.Layers = torch.nn.ModuleList()
        self.Norms = torch.nn.ModuleList()
        Channels = [2, 16, 32, 32, 64, 64, 128, 128, 256, 256, 512, 1024, 1]

        # Create the convolutional layers.
        for i in range(len(Channels) - 1):
            if i != len(Channels) - 2:
                self.Layers.append(
                    nn.Conv1d(
                        in_channels=Channels[i],
                        out_channels=Channels[i + 1],
                        kernel_size=kernel_size,
                        stride=2,
                        padding=floor(kernel_size / 2),  # same
                    )
                )
                self.Norms.append(
                    # num_features is the channel dimension of the
                    # corresponding convolution output.
                    nn.BatchNorm1d(num_features=Channels[i + 1])
                )
            # Output convolution
            else:
                self.Layers.append(
                    nn.Conv1d(
                        in_channels=Channels[i],
                        out_channels=Channels[i + 1],
                        kernel_size=1,
                        stride=1,
                        padding=0,
                    )
                )
                # Final linear layer; in_features=8 is the time dimension
                # remaining after the eleven stride-2 convolutions
                # (e.g. 16384-sample input windows).
                self.Layers.append(nn.Linear(in_features=8, out_features=1))
    def forward(self, x):
        """Forward pass through the discriminator."""
        x = x.permute(0, 2, 1)
        # Encode
        for i in range(len(self.Norms)):
            x = self.Layers[i](x)
            x = self.Norms[i](x)
            x = F.leaky_relu(x, negative_slope=0.3)
        # Output
        x = self.Layers[-2](x)
        x = self.Layers[-1](x)
        # x = F.sigmoid(x)
        x = x.permute(0, 2, 1)
        return x  # in logit format
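
# A minimal usage sketch of the Discriminator (variable names are
# illustrative). It scores a pair of signals stacked along the channel
# dimension, e.g. the enhanced (or clean) signal together with the noisy
# signal, and the hardcoded Linear(in_features=8) implies 16384-sample
# windows (8 * 2 ** 11).
#
#     discriminator = Discriminator(kernel_size=31)
#     pair = torch.cat((enhanced, noisy), dim=2)  # (batch, time, 2)
#     logits = discriminator(pair)  # (batch, 1, 1)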
def d1_loss(d_outputs, reduction="mean"):
    """Calculates the loss of the discriminator when the inputs are clean."""
    output = 0.5 * ((d_outputs - 1) ** 2)
    if reduction == "mean":
        return output.mean()
    elif reduction == "batch":
        return output.view(output.size(0), -1).mean(1)
def d2_loss(d_outputs, reduction="mean"):
    """Calculates the loss of the discriminator when the inputs are not clean."""
    output = 0.5 * (d_outputs ** 2)
    if reduction == "mean":
        return output.mean()
    elif reduction == "batch":
        return output.view(output.size(0), -1).mean(1)
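
# d1_loss and d2_loss implement the two halves of the least-squares GAN
# objective: d1_loss pushes the discriminator output towards 1 for clean
# pairs, while d2_loss pushes it towards 0 for enhanced (not clean) pairs.
# A sketch of a discriminator update with illustrative variable names:
#
#     d_loss = d1_loss(discriminator(clean_pair)) + d2_loss(
#         discriminator(enhanced_pair.detach())
#     )
#     d_loss.backward()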
def g3_loss(
    d_outputs,
    predictions,
    targets,
    length,
    l1LossCoeff,
    klLossCoeff,
    z_mean=None,
    z_logvar=None,
    reduction="mean",
):
    """Calculates the loss of the generator given the discriminator outputs."""
    discrimloss = 0.5 * ((d_outputs - 1) ** 2)
    l1norm = torch.nn.functional.l1_loss(predictions, targets, reduction="none")
    # A non-None z_mean indicates that the model is being trained as a VAE.
    if z_mean is not None:
        ZERO = torch.zeros_like(z_mean)
        distq = torch.distributions.normal.Normal(
            z_mean, torch.exp(z_logvar) ** (1 / 2)
        )
        distp = torch.distributions.normal.Normal(ZERO, torch.exp(ZERO) ** (1 / 2))
        kl = torch.distributions.kl.kl_divergence(distq, distp)
        kl = kl.sum(axis=1).sum(axis=1).mean()
    else:
        kl = 0
    if reduction == "mean":
        return discrimloss.mean() + l1LossCoeff * l1norm.mean() + klLossCoeff * kl
    elif reduction == "batch":
        dloss = discrimloss.view(discrimloss.size(0), -1).mean(1)
        lloss = l1norm.view(l1norm.size(0), -1).mean(1)
        return dloss + l1LossCoeff * lloss + klLossCoeff * kl
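
# A sketch of the corresponding generator update. The loss coefficients are
# illustrative (the SEGAN paper weights the L1 term with 100); length is
# accepted but not used by g3_loss, and z_mean/z_logvar are only passed when
# the generator was built with latent_vae=True.
#
#     g_loss = g3_loss(
#         d_outputs=discriminator(enhanced_pair),
#         predictions=enhanced,
#         targets=clean,
#         length=None,
#         l1LossCoeff=100,
#         klLossCoeff=1,
#     )
#     g_loss.backward()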