Source code for speechbrain.wordemb.util

"""
Utilities for word embeddings

Authors
* Artem Ploujnikov 2021
"""
import torch


def expand_to_chars(emb, seq, seq_len, word_separator):
    """Expands word embeddings to a sequence of character embeddings,
    assigning each character the word embedding of the word to which
    it belongs.

    Arguments
    ---------
    emb: torch.Tensor
        a tensor of word embeddings
    seq: torch.Tensor
        a tensor of character token indices
    seq_len: torch.Tensor
        a tensor of sequence lengths
    word_separator: torch.Tensor
        the index of the token used as the word separator

    Returns
    -------
    char_word_emb: torch.Tensor
        a combined character + word embedding tensor

    Example
    -------
    >>> import torch
    >>> emb = torch.tensor(
    ...     [[[1., 2., 3.],
    ...       [3., 1., 2.],
    ...       [0., 0., 0.]],
    ...      [[1., 3., 2.],
    ...       [3., 2., 1.],
    ...       [2., 3., 1.]]]
    ... )
    >>> seq = torch.tensor(
    ...     [[1, 2, 0, 2, 1, 0],
    ...      [1, 0, 1, 2, 0, 2]]
    ... )
    >>> seq_len = torch.tensor([4, 5])
    >>> word_separator = 0
    >>> expand_to_chars(emb, seq, seq_len, word_separator)
    tensor([[[1., 2., 3.],
             [1., 2., 3.],
             [0., 0., 0.],
             [3., 1., 2.],
             [3., 1., 2.],
             [0., 0., 0.]],
    <BLANKLINE>
            [[1., 3., 2.],
             [0., 0., 0.],
             [3., 2., 1.],
             [3., 2., 1.],
             [0., 0., 0.],
             [2., 3., 1.]]])
    """
    # Positions in the character sequence that hold the word separator token
    word_boundaries = seq == word_separator
    # The running count of separators gives each character its word index
    words = word_boundaries.cumsum(dim=-1)
    # TODO: Find a way to vectorize over the batch axis
    char_word_emb = torch.zeros(emb.size(0), seq.size(-1), emb.size(-1)).to(
        emb.device
    )
    seq_len_idx = (seq_len * seq.size(-1)).int()
    for idx, (item, item_length) in enumerate(zip(words, seq_len_idx)):
        # Gather the word embedding for each character position
        char_word_emb[idx] = emb[idx, item]
        # Zero out positions beyond the sequence length
        char_word_emb[idx, item_length:, :] = 0
        # Zero out the separator positions themselves
        char_word_emb[idx, word_boundaries[idx], :] = 0
    return char_word_emb
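

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the SpeechBrain API): one
# plausible way to fuse the expanded word embeddings with character-level
# embeddings by concatenating along the feature axis. All tensor names below
# (`word_emb`, `char_seq`, `char_len`, `char_emb`) are hypothetical; the
# relative lengths follow the `seq_len * seq.size(-1)` scaling used above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    word_emb = torch.randn(2, 3, 4)  # (batch, n_words, word_emb_dim)
    char_seq = torch.tensor(
        [[1, 2, 0, 2, 1, 0], [1, 0, 1, 2, 0, 2]]
    )  # character token indices; 0 marks word boundaries
    char_len = torch.tensor([1.0, 1.0])  # relative lengths (fraction of max)
    char_word_emb = expand_to_chars(word_emb, char_seq, char_len, 0)
    char_emb = torch.randn(2, 6, 8)  # (batch, n_chars, char_emb_dim)
    # Each character position now carries both its own and its word's features
    combined = torch.cat([char_emb, char_word_emb], dim=-1)
    print(combined.shape)  # torch.Size([2, 6, 12])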