Mozilla TTS on CPU Real-Time Speech Synthesis with TensorFlow
These models are converted from the released PyTorch models using our TF utilities provided in Mozilla TTS.
The TF models target TF 2.2; for other TF versions you may need to regenerate them.
We use Tacotron2 and MultiBand-MelGAN models trained on the LJSpeech dataset.
Tacotron2 is trained with Double Decoder Consistency (DDC) for only 130K steps (about 3 days) on a single GPU.
MultiBand-MelGAN is trained for 1.45M steps on real spectrograms.
Note that the performance of both models can be improved with more training.
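Since the exported models are tied to a specific TensorFlow release, it can help to confirm the installed version before downloading anything; this is a minimal optional check:
In [ ]:
import tensorflow as tf

# the released TF models target TF 2.2; other versions may require regenerating the models
print(tf.__version__)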
Download Models
In [ ]:
!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl
!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json
In [ ]:
!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl
!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json
!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy
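If gdown hits a download quota or fails silently, the later cells fail with confusing errors, so it can be worth checking that every file actually landed in data/ before continuing:
In [ ]:
import os

# fail early with a clear message if any download is missing
for path in ["data/tts_model.pkl", "data/config.json", "data/vocoder_model.pkl",
             "data/config_vocoder.json", "data/scale_stats.npy"]:
    assert os.path.isfile(path), "missing {} - re-run the download cells".format(path)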
Define TTS function
In [ ]:
def tts(model, text, CONFIG, p):
    t_1 = time.time()
    # run the Tacotron2 TF model; `ap` and `speaker_id` come from the cells below
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars, backend='tf')
    # run the MultiBand-MelGAN vocoder on the predicted mel spectrogram
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    waveform = waveform.numpy()[0, 0]
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform
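For reference, the real-time factor printed by tts() is synthesis time divided by audio duration (len(waveform) / ap.sample_rate): for example, 1.1 seconds of compute for 2.2 seconds of audio gives an RTF of 0.5, and any value below 1.0 means synthesis runs faster than real time on this CPU.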
Load Models
In [ ]:
import os
import torch
import time
import IPython

from TTS.tts.tf.utils.generic_utils import setup_model
from TTS.tts.tf.utils.io import load_checkpoint
from TTS.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
In [ ]:
# runtime settings
use_cuda = False
In [ ]:
# model paths
TTS_MODEL = "data/tts_model.pkl"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.pkl"
VOCODER_CONFIG = "data/config_vocoder.json"
In [ ]:
# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)
In [ ]:
# load the audio processor
TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'
ap = AudioProcessor(**TTS_CONFIG.audio)
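scale_stats.npy holds the normalization statistics computed on the training data, which the audio processor uses to scale spectrograms. If you want to peek at what is inside, a sketch like the one below should work, assuming the file stores a pickled dict of statistics as produced by the TTS tooling:
In [ ]:
import numpy as np

# assumption: scale_stats.npy is a pickled dict of per-feature normalization statistics
stats = np.load('data/scale_stats.npy', allow_pickle=True).item()
print(list(stats.keys()))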
In [ ]:
# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)
model.build_inference()
model = load_checkpoint(model, TTS_MODEL)
model.decoder.set_max_decoder_steps(1000)
In [ ]:
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint  # note: shadows the TTS load_checkpoint imported above

# LOAD VOCODER MODEL
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.build_inference()
vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)
vocoder_model.inference_padding = 0

ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
Run Inference
In [ ]:
sentence = "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)
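If you want to keep the generated audio rather than only playing it inline, the audio processor can write it to disk; the output path below is just an example:
In [ ]:
# save the synthesized waveform as a wav file (the filename is arbitrary)
ap.save_wav(wav, "data/tts_example.wav")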