# Source code for speechbrain.inference.TTS

""" Specifies the inference interfaces for Text-To-Speech (TTS) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
"""
import re
import logging
import torch
import torchaudio
import random
import speechbrain
from speechbrain.utils.fetching import fetch
from speechbrain.inference.interfaces import Pretrained
from speechbrain.utils.text_to_sequence import text_to_sequence
from speechbrain.inference.text import GraphemeToPhoneme
from speechbrain.inference.encoders import MelSpectrogramEncoder
from speechbrain.inference.classifiers import EncoderClassifier


logger = logging.getLogger(__name__)


class Tacotron2(Pretrained):
    """A ready-to-use wrapper for Tacotron2 (text -> mel_spec).

    Arguments
    ---------
    hparams
        Hyperparameters (from HyperPyYAML)

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir=tmpdir_tts)
    >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
    >>> items = [
    ...     "A quick brown fox jumped over the lazy dog",
    ...     "How much wood would a woodchuck chuck?",
    ...     "Never odd or even"
    ... ]
    >>> mel_outputs, mel_lengths, alignments = tacotron2.encode_batch(items)
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFiGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder)
    >>> # Running the TTS
    >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_output)
    """

    HPARAMS_NEEDED = ["model", "text_to_sequence"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.text_cleaners = getattr(
            self.hparams, "text_cleaners", ["english_cleaners"]
        )
        self.infer = self.hparams.model.infer
    def text_to_seq(self, txt):
        """Encodes raw text into a tensor with a custom text-to-sequence function"""
        sequence = self.hparams.text_to_sequence(txt, self.text_cleaners)
        return sequence, len(sequence)
    def encode_batch(self, texts):
        """Computes mel-spectrograms for a list of texts

        Texts must be sorted in decreasing order of their lengths

        Arguments
        ---------
        texts: List[str]
            texts to be encoded into spectrogram

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        with torch.no_grad():
            inputs = [
                {
                    "text_sequences": torch.tensor(
                        self.text_to_seq(item)[0], device=self.device
                    )
                }
                for item in texts
            ]
            inputs = speechbrain.dataio.batch.PaddedBatch(inputs)

            lens = [self.text_to_seq(item)[1] for item in texts]
            assert lens == sorted(
                lens, reverse=True
            ), "input lengths must be sorted in decreasing order"
            input_lengths = torch.tensor(lens, device=self.device)

            mel_outputs_postnet, mel_lengths, alignments = self.infer(
                inputs.text_sequences.data, input_lengths
            )
        return mel_outputs_postnet, mel_lengths, alignments
    def encode_text(self, text):
        """Runs inference for a single text str"""
        return self.encode_batch([text])
    def forward(self, texts):
        """Encodes the input texts."""
        return self.encode_batch(texts)
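
# A minimal end-to-end sketch of Tacotron2 + HiFiGAN, kept as a comment so the
# module stays side-effect free. It assumes network access to download the
# pretrained sources named in the docstring above; "pretrained_models/" and the
# output filename are hypothetical, and 22050 Hz matches the LJSpeech models:
#
#   from speechbrain.inference.TTS import Tacotron2
#   from speechbrain.inference.vocoders import HIFIGAN
#   tacotron2 = Tacotron2.from_hparams(
#       source="speechbrain/tts-tacotron2-ljspeech",
#       savedir="pretrained_models/tts-tacotron2-ljspeech",
#   )
#   hifi_gan = HIFIGAN.from_hparams(
#       source="speechbrain/tts-hifigan-ljspeech",
#       savedir="pretrained_models/tts-hifigan-ljspeech",
#   )
#   mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
#   waveforms = hifi_gan.decode_batch(mel_output)
#   torchaudio.save("example_TTS.wav", waveforms.squeeze(1), 22050)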
class MSTacotron2(Pretrained):
    """A ready-to-use wrapper for Zero-Shot Multi-Speaker Tacotron2.
    For voice cloning: (text, reference_audio) -> (mel_spec).
    For generating a random speaker voice: (text) -> (mel_spec).

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> mstacotron2 = MSTacotron2.from_hparams(source="speechbrain/tts-mstacotron2-libritts", savedir=tmpdir_tts) # doctest: +SKIP
    >>> # Sample rate of the reference audio must be greater than or equal to the sample rate of the speaker embedding model
    >>> reference_audio_path = "tests/samples/single-mic/example1.wav"
    >>> input_text = "Mary had a little lamb."
    >>> mel_output, mel_length, alignment = mstacotron2.clone_voice(input_text, reference_audio_path) # doctest: +SKIP
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFiGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_output, mel_length, alignment = mstacotron2.clone_voice(input_text, reference_audio_path) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_output) # doctest: +SKIP
    >>> # For generating a random speaker voice, use the following
    >>> mel_output, mel_length, alignment = mstacotron2.generate_random_voice(input_text) # doctest: +SKIP
    """

    HPARAMS_NEEDED = ["model"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.text_cleaners = ["english_cleaners"]
        self.infer = self.hparams.model.infer
        self.custom_mel_spec_encoder = self.hparams.custom_mel_spec_encoder

        self.g2p = GraphemeToPhoneme.from_hparams(
            self.hparams.g2p, run_opts={"device": self.device}
        )

        self.spk_emb_encoder = None
        if self.custom_mel_spec_encoder:
            self.spk_emb_encoder = MelSpectrogramEncoder.from_hparams(
                source=self.hparams.spk_emb_encoder,
                run_opts={"device": self.device},
            )
        else:
            self.spk_emb_encoder = EncoderClassifier.from_hparams(
                source=self.hparams.spk_emb_encoder,
                run_opts={"device": self.device},
            )

    def __text_to_seq(self, txt):
        """Encodes raw text into a tensor with a custom text-to-sequence function"""
        sequence = text_to_sequence(txt, self.text_cleaners)
        return sequence, len(sequence)
    def clone_voice(self, texts, audio_path):
        """Generates mel-spectrogram using input text and reference audio

        Arguments
        ---------
        texts : str or list
            Input text
        audio_path : str
            Reference audio

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        # Loads audio
        ref_signal, signal_sr = torchaudio.load(audio_path)

        # Resamples the audio if required
        if signal_sr != self.hparams.spk_emb_sample_rate:
            ref_signal = torchaudio.functional.resample(
                ref_signal, signal_sr, self.hparams.spk_emb_sample_rate
            )
        ref_signal = ref_signal.to(self.device)

        # Computes speaker embedding
        if self.custom_mel_spec_encoder:
            spk_emb = self.spk_emb_encoder.encode_waveform(ref_signal)
        else:
            spk_emb = self.spk_emb_encoder.encode_batch(ref_signal)

        spk_emb = spk_emb.squeeze(0)

        # Converts input texts into the corresponding phoneme sequences
        if isinstance(texts, str):
            texts = [texts]
        phoneme_seqs = self.g2p(texts)
        for i in range(len(phoneme_seqs)):
            phoneme_seqs[i] = " ".join(phoneme_seqs[i])
            phoneme_seqs[i] = "{" + phoneme_seqs[i] + "}"

        # Repeats the speaker embedding to match the number of input texts
        spk_embs = spk_emb.repeat(len(texts), 1)

        # Calls __encode_batch to generate the mel-spectrograms
        return self.__encode_batch(phoneme_seqs, spk_embs)
    def generate_random_voice(self, texts):
        """Generates mel-spectrogram using input text and a random speaker voice

        Arguments
        ---------
        texts : str or list
            Input text

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        spk_emb = self.__sample_random_speaker().float()
        spk_emb = spk_emb.to(self.device)

        # Converts input texts into the corresponding phoneme sequences
        if isinstance(texts, str):
            texts = [texts]
        phoneme_seqs = self.g2p(texts)
        for i in range(len(phoneme_seqs)):
            phoneme_seqs[i] = " ".join(phoneme_seqs[i])
            phoneme_seqs[i] = "{" + phoneme_seqs[i] + "}"

        # Repeats the speaker embedding to match the number of input texts
        spk_embs = spk_emb.repeat(len(texts), 1)

        # Calls __encode_batch to generate the mel-spectrograms
        return self.__encode_batch(phoneme_seqs, spk_embs)
    def __encode_batch(self, texts, spk_embs):
        """Computes mel-spectrograms for a list of texts

        Texts are sorted in decreasing order of their lengths

        Arguments
        ---------
        texts: List[str]
            texts to be encoded into spectrogram
        spk_embs: torch.Tensor
            speaker embeddings

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        with torch.no_grad():
            inputs = [
                {
                    "text_sequences": torch.tensor(
                        self.__text_to_seq(item)[0], device=self.device
                    )
                }
                for item in texts
            ]

            inputs = sorted(
                inputs,
                key=lambda x: x["text_sequences"].size()[0],
                reverse=True,
            )

            lens = [entry["text_sequences"].size()[0] for entry in inputs]

            inputs = speechbrain.dataio.batch.PaddedBatch(inputs)

            assert lens == sorted(
                lens, reverse=True
            ), "input lengths must be sorted in decreasing order"

            input_lengths = torch.tensor(lens, device=self.device)

            mel_outputs_postnet, mel_lengths, alignments = self.infer(
                inputs.text_sequences.data, spk_embs, input_lengths
            )
        return mel_outputs_postnet, mel_lengths, alignments

    def __sample_random_speaker(self):
        """Samples a random speaker embedding from a pretrained GMM

        Returns
        -------
        x: torch.Tensor
            A randomly sampled speaker embedding
        """
        # Fetches and loads the GMM trained on speaker embeddings
        speaker_gmm_local_path = fetch(
            filename=self.hparams.random_speaker_sampler,
            source=self.hparams.random_speaker_sampler_source,
            savedir=self.hparams.pretrainer.collect_in,
        )
        random_speaker_gmm = torch.load(speaker_gmm_local_path)
        gmm_n_components = random_speaker_gmm["gmm_n_components"]
        gmm_means = random_speaker_gmm["gmm_means"]
        gmm_covariances = random_speaker_gmm["gmm_covariances"]

        # Randomly selects a speaker
        counts = torch.zeros(gmm_n_components)
        counts[random.randint(0, gmm_n_components - 1)] = 1
        x = torch.empty(0, device=counts.device)

        # Samples an embedding for the speaker
        for k in torch.arange(gmm_n_components)[counts > 0]:
            # Considers full covariance type
            d_k = torch.distributions.multivariate_normal.MultivariateNormal(
                gmm_means[k], gmm_covariances[k]
            )
            x_k = torch.stack([d_k.sample() for _ in range(int(counts[k]))])

            x = torch.cat((x, x_k), dim=0)

        return x
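
# A minimal voice-cloning sketch for MSTacotron2, kept as a comment so the
# module stays side-effect free. It assumes the pretrained sources named in
# the docstring above; "pretrained_models/" and "reference_speaker.wav" are
# hypothetical placeholders:
#
#   from speechbrain.inference.TTS import MSTacotron2
#   from speechbrain.inference.vocoders import HIFIGAN
#   mstacotron2 = MSTacotron2.from_hparams(
#       source="speechbrain/tts-mstacotron2-libritts",
#       savedir="pretrained_models/tts-mstacotron2-libritts",
#   )
#   hifi_gan = HIFIGAN.from_hparams(
#       source="speechbrain/tts-hifigan-libritts-22050Hz",
#       savedir="pretrained_models/tts-hifigan-libritts-22050Hz",
#   )
#   # Clone the voice of a reference recording...
#   mel_output, mel_length, alignment = mstacotron2.clone_voice(
#       "Mary had a little lamb.", "reference_speaker.wav"
#   )
#   # ...or sample a random speaker from the pretrained GMM instead
#   mel_output, mel_length, alignment = mstacotron2.generate_random_voice(
#       "Mary had a little lamb."
#   )
#   waveforms = hifi_gan.decode_batch(mel_output)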
class FastSpeech2(Pretrained):
    """A ready-to-use wrapper for FastSpeech2 (text -> mel_spec).

    Arguments
    ---------
    hparams
        Hyperparameters (from HyperPyYAML)

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> fastspeech2 = FastSpeech2.from_hparams(source="speechbrain/tts-fastspeech2-ljspeech", savedir=tmpdir_tts) # doctest: +SKIP
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> items = [
    ...     "A quick brown fox jumped over the lazy dog",
    ...     "How much wood would a woodchuck chuck?",
    ...     "Never odd or even"
    ... ]
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(items) # doctest: +SKIP
    >>>
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFiGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_outputs) # doctest: +SKIP
    """

    HPARAMS_NEEDED = ["spn_predictor", "model", "input_encoder"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        lexicon = self.hparams.lexicon
        lexicon = ["@@"] + lexicon
        self.input_encoder = self.hparams.input_encoder
        self.input_encoder.update_from_iterable(lexicon, sequence_input=False)
        self.input_encoder.add_unk()

        self.g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")

        self.spn_token_encoded = (
            self.input_encoder.encode_sequence_torch(["spn"]).int().item()
        )
    def encode_text(self, texts, pace=1.0, pitch_rate=1.0, energy_rate=1.0):
        """Computes mel-spectrogram for a list of texts

        Arguments
        ---------
        texts: List[str]
            texts to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        # Preprocessing required at the inference time for the input text
        # "label" below contains input text
        # "phoneme_labels" contain the phoneme sequences corresponding to input text labels
        # "last_phonemes_combined" is used to indicate whether the index position is for a last phoneme of a word
        # "punc_positions" is used to add back the silence for punctuations
        phoneme_labels = list()
        last_phonemes_combined = list()
        punc_positions = list()

        for label in texts:
            phoneme_label = list()
            last_phonemes = list()
            punc_position = list()

            words = label.split()
            words = [word.strip() for word in words]
            words_phonemes = self.g2p(words)

            for i in range(len(words_phonemes)):
                words_phonemes_seq = words_phonemes[i]
                for phoneme in words_phonemes_seq:
                    if not phoneme.isspace():
                        phoneme_label.append(phoneme)
                        last_phonemes.append(0)
                        punc_position.append(0)
                last_phonemes[-1] = 1
                if words[i][-1] in ":;-,.!?":
                    punc_position[-1] = 1

            phoneme_labels.append(phoneme_label)
            last_phonemes_combined.append(last_phonemes)
            punc_positions.append(punc_position)

        # Inserts silent phonemes in the input phoneme sequence
        all_tokens_with_spn = list()
        max_seq_len = -1
        for i in range(len(phoneme_labels)):
            phoneme_label = phoneme_labels[i]
            token_seq = (
                self.input_encoder.encode_sequence_torch(phoneme_label)
                .int()
                .to(self.device)
            )
            last_phonemes = torch.LongTensor(last_phonemes_combined[i]).to(
                self.device
            )

            # Runs the silent phoneme predictor
            spn_preds = (
                self.hparams.modules["spn_predictor"]
                .infer(token_seq.unsqueeze(0), last_phonemes.unsqueeze(0))
                .int()
            )

            spn_to_add = torch.nonzero(spn_preds).reshape(-1).tolist()

            for j in range(len(punc_positions[i])):
                if punc_positions[i][j] == 1:
                    spn_to_add.append(j)

            tokens_with_spn = list()

            for token_idx in range(token_seq.shape[0]):
                tokens_with_spn.append(token_seq[token_idx].item())
                if token_idx in spn_to_add:
                    tokens_with_spn.append(self.spn_token_encoded)

            tokens_with_spn = torch.LongTensor(tokens_with_spn).to(self.device)
            all_tokens_with_spn.append(tokens_with_spn)

            if max_seq_len < tokens_with_spn.shape[-1]:
                max_seq_len = tokens_with_spn.shape[-1]

        # "tokens_with_spn_tensor_padded" holds the input phoneme sequence with silent phonemes
        tokens_with_spn_tensor_padded = torch.LongTensor(
            len(texts), max_seq_len
        ).to(self.device)
        tokens_with_spn_tensor_padded.zero_()

        for seq_idx, seq in enumerate(all_tokens_with_spn):
            tokens_with_spn_tensor_padded[seq_idx, : len(seq)] = seq

        return self.encode_batch(
            tokens_with_spn_tensor_padded,
            pace=pace,
            pitch_rate=pitch_rate,
            energy_rate=energy_rate,
        )
    def encode_phoneme(
        self, phonemes, pace=1.0, pitch_rate=1.0, energy_rate=1.0
    ):
        """Computes mel-spectrogram for a list of phoneme sequences

        Arguments
        ---------
        phonemes: List[List[str]]
            phonemes to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        all_tokens = []
        max_seq_len = -1
        for phoneme in phonemes:
            token_seq = (
                self.input_encoder.encode_sequence_torch(phoneme)
                .int()
                .to(self.device)
            )
            if max_seq_len < token_seq.shape[-1]:
                max_seq_len = token_seq.shape[-1]
            all_tokens.append(token_seq)

        tokens_padded = torch.LongTensor(len(phonemes), max_seq_len).to(
            self.device
        )
        tokens_padded.zero_()

        for seq_idx, seq in enumerate(all_tokens):
            tokens_padded[seq_idx, : len(seq)] = seq

        return self.encode_batch(
            tokens_padded,
            pace=pace,
            pitch_rate=pitch_rate,
            energy_rate=energy_rate,
        )
    def encode_batch(
        self, tokens_padded, pace=1.0, pitch_rate=1.0, energy_rate=1.0
    ):
        """Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        tokens_padded : torch.Tensor
            A sequence of encoded phonemes to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies
        """
        with torch.no_grad():
            (
                _,
                post_mel_outputs,
                durations,
                pitch,
                _,
                energy,
                _,
                _,
            ) = self.hparams.model(
                tokens_padded,
                pace=pace,
                pitch_rate=pitch_rate,
                energy_rate=energy_rate,
            )

            # Transposes to make it compliant with the expected HiFiGAN format
            post_mel_outputs = post_mel_outputs.transpose(-1, 1)

        return post_mel_outputs, durations, pitch, energy
    def forward(self, text, pace=1.0, pitch_rate=1.0, energy_rate=1.0):
        """Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        text : str
            A text to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies
        """
        return self.encode_text(
            [text], pace=pace, pitch_rate=pitch_rate, energy_rate=energy_rate
        )
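
# A minimal prosody-control sketch for FastSpeech2, kept as a comment so the
# module stays side-effect free. It assumes the pretrained sources named in
# the docstring above ("pretrained_models/" is a hypothetical savedir).
# pace, pitch_rate, and energy_rate are multiplicative scaling factors applied
# to the predicted durations, pitches, and energies, so 1.0 leaves the
# model's predictions unchanged:
#
#   from speechbrain.inference.TTS import FastSpeech2
#   from speechbrain.inference.vocoders import HIFIGAN
#   fastspeech2 = FastSpeech2.from_hparams(
#       source="speechbrain/tts-fastspeech2-ljspeech",
#       savedir="pretrained_models/tts-fastspeech2-ljspeech",
#   )
#   hifi_gan = HIFIGAN.from_hparams(
#       source="speechbrain/tts-hifigan-ljspeech",
#       savedir="pretrained_models/tts-hifigan-ljspeech",
#   )
#   mel_outputs, durations, pitch, energy = fastspeech2.encode_text(
#       ["Mary had a little lamb."], pace=1.2, pitch_rate=1.0, energy_rate=1.0
#   )
#   waveforms = hifi_gan.decode_batch(mel_outputs)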
class FastSpeech2InternalAlignment(Pretrained):
    """A ready-to-use wrapper for FastSpeech2 with internal alignment (text -> mel_spec).

    Arguments
    ---------
    hparams
        Hyperparameters (from HyperPyYAML)

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> fastspeech2 = FastSpeech2InternalAlignment.from_hparams(source="speechbrain/tts-fastspeech2-internal-alignment-ljspeech", savedir=tmpdir_tts) # doctest: +SKIP
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> items = [
    ...     "A quick brown fox jumped over the lazy dog",
    ...     "How much wood would a woodchuck chuck?",
    ...     "Never odd or even"
    ... ]
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(items) # doctest: +SKIP
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFiGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_outputs) # doctest: +SKIP
    """

    HPARAMS_NEEDED = ["model", "input_encoder"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        lexicon = self.hparams.lexicon
        lexicon = ["@@"] + lexicon
        self.input_encoder = self.hparams.input_encoder
        self.input_encoder.update_from_iterable(lexicon, sequence_input=False)
        self.input_encoder.add_unk()

        self.g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")
    def encode_text(self, texts, pace=1.0, pitch_rate=1.0, energy_rate=1.0):
        """Computes mel-spectrogram for a list of texts

        Arguments
        ---------
        texts: List[str]
            texts to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        # Preprocessing required at the inference time for the input text
        # "label" below contains input text
        # "phoneme_labels" contain the phoneme sequences corresponding to input text labels
        phoneme_labels = list()
        max_seq_len = -1

        for label in texts:
            phonemes_with_punc = self._g2p_keep_punctuations(self.g2p, label)
            if max_seq_len < len(phonemes_with_punc):
                max_seq_len = len(phonemes_with_punc)
            token_seq = (
                self.input_encoder.encode_sequence_torch(phonemes_with_punc)
                .int()
                .to(self.device)
            )
            phoneme_labels.append(token_seq)

        tokens_padded = torch.LongTensor(len(texts), max_seq_len).to(
            self.device
        )
        tokens_padded.zero_()

        for seq_idx, seq in enumerate(phoneme_labels):
            tokens_padded[seq_idx, : len(seq)] = seq

        return self.encode_batch(
            tokens_padded,
            pace=pace,
            pitch_rate=pitch_rate,
            energy_rate=energy_rate,
        )
    def _g2p_keep_punctuations(self, g2p_model, text):
        """Does grapheme-to-phoneme conversion and keeps the punctuations between the words"""
        # Find the words where a "-" or "'" or "." or ":" appears in the middle
        special_words = re.findall(r"\w+[-':\.][-':\.\w]*\w+", text)

        # Remove intra-word punctuations ("-':."); this does not change the output of the SpeechBrain G2P
        for special_word in special_words:
            rmp = special_word.replace("-", "")
            rmp = rmp.replace("'", "")
            rmp = rmp.replace(":", "")
            rmp = rmp.replace(".", "")
            text = text.replace(special_word, rmp)

        # Keep inter-word punctuations
        all_ = re.findall(r"[\w]+|[-!'(),.:;? ]", text)
        try:
            phonemes = g2p_model(text)
        except RuntimeError:
            logger.info(f"error with text: {text}")
            quit()
        word_phonemes = "-".join(phonemes).split(" ")

        phonemes_with_punc = []
        count = 0
        try:
            # If the g2p model splits the words correctly
            for i in all_:
                if i not in "-!'(),.:;? ":
                    phonemes_with_punc.extend(word_phonemes[count].split("-"))
                    count += 1
                else:
                    phonemes_with_punc.append(i)
        except IndexError:
            # Sometimes the g2p model cannot split the words correctly
            logger.warning(
                f"Do g2p word by word because of unexpected outputs from g2p for text: {text}"
            )

            # Discards the partial result of the failed pass before redoing g2p word by word
            phonemes_with_punc = []
            for i in all_:
                if i not in "-!'(),.:;? ":
                    p = g2p_model.g2p(i)
                    p_without_space = [i for i in p if i != " "]
                    phonemes_with_punc.extend(p_without_space)
                else:
                    phonemes_with_punc.append(i)

        while "" in phonemes_with_punc:
            phonemes_with_punc.remove("")
        return phonemes_with_punc
    def encode_phoneme(
        self, phonemes, pace=1.0, pitch_rate=1.0, energy_rate=1.0
    ):
        """Computes mel-spectrogram for a list of phoneme sequences

        Arguments
        ---------
        phonemes: List[List[str]]
            phonemes to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        """
        all_tokens = []
        max_seq_len = -1
        for phoneme in phonemes:
            token_seq = (
                self.input_encoder.encode_sequence_torch(phoneme)
                .int()
                .to(self.device)
            )
            if max_seq_len < token_seq.shape[-1]:
                max_seq_len = token_seq.shape[-1]
            all_tokens.append(token_seq)

        tokens_padded = torch.LongTensor(len(phonemes), max_seq_len).to(
            self.device
        )
        tokens_padded.zero_()

        for seq_idx, seq in enumerate(all_tokens):
            tokens_padded[seq_idx, : len(seq)] = seq

        return self.encode_batch(
            tokens_padded,
            pace=pace,
            pitch_rate=pitch_rate,
            energy_rate=energy_rate,
        )
    def encode_batch(
        self, tokens_padded, pace=1.0, pitch_rate=1.0, energy_rate=1.0
    ):
        """Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        tokens_padded : torch.Tensor
            A sequence of encoded phonemes to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies
        """
        with torch.no_grad():
            (
                _,
                post_mel_outputs,
                durations,
                pitch,
                _,
                energy,
                _,
                _,
                _,
                _,
                _,
                _,
            ) = self.hparams.model(
                tokens_padded,
                pace=pace,
                pitch_rate=pitch_rate,
                energy_rate=energy_rate,
            )

            # Transposes to make it compliant with the expected HiFiGAN format
            post_mel_outputs = post_mel_outputs.transpose(-1, 1)

        return post_mel_outputs, durations, pitch, energy
    def forward(self, text, pace=1.0, pitch_rate=1.0, energy_rate=1.0):
        """Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        text : str
            A text to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies
        """
        return self.encode_text(
            [text], pace=pace, pitch_rate=pitch_rate, energy_rate=energy_rate
        )
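
# A minimal sketch of bypassing text preprocessing via encode_phoneme, kept as
# a comment so the module stays side-effect free. It assumes the pretrained
# source named in the docstring above; "pretrained_models/" is hypothetical.
# The phoneme sequences are derived here from the model's own g2p, with space
# tokens filtered out since they are not part of the input lexicon (mirroring
# the filtering done in encode_text above):
#
#   from speechbrain.inference.TTS import FastSpeech2InternalAlignment
#   fastspeech2 = FastSpeech2InternalAlignment.from_hparams(
#       source="speechbrain/tts-fastspeech2-internal-alignment-ljspeech",
#       savedir="pretrained_models/tts-fastspeech2-internal-alignment-ljspeech",
#   )
#   phonemes = [
#       [p for p in seq if not p.isspace()]
#       for seq in fastspeech2.g2p(["Mary had a little lamb"])
#   ]
#   mel_outputs, durations, pitch, energy = fastspeech2.encode_phoneme(phonemes)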