Mozilla TTS on CPU Real-Time Speech Synthesis with TensorFlow¶
These models are converted from the released PyTorch models using the TF utilities provided in mozilla_voice_tts.
These TF models target TF 2.2; for other versions you may need to regenerate them.
We use Tacotron2 and MultiBand-MelGAN models trained on the LJSpeech dataset.
Tacotron2 is trained with Double Decoder Consistency (DDC) for only 130K steps (3 days) on a single GPU.
MultiBand-MelGAN is trained for 1.45M steps with real spectrograms.
Note that both models' performance can be improved with more training.
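Since the released models target TF 2.2, it can help to verify the installed version up front. A minimal sketch (the check is an assumption about when regeneration is needed, not part of the original notebook):
In [ ]:
# quick sanity check: the released models target TF 2.2
import tensorflow as tf

print(tf.__version__)
assert tf.__version__.startswith("2.2"), "Models target TF 2.2; other versions may need regenerated models."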
Download Models¶
In [ ]:
!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl
!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json
In [ ]:
!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl
!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json
!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy
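Downloads can fail silently, so a quick existence check before loading is cheap insurance. A minimal sketch over the file paths the cells below expect:
In [ ]:
# optional: verify all five downloads landed where the cells below expect them
import os

for path in ["data/tts_model.pkl", "data/config.json",
             "data/vocoder_model.pkl", "data/config_vocoder.json",
             "data/scale_stats.npy"]:
    assert os.path.isfile(path), "missing {}".format(path)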
Define TTS function¶
In [ ]:
def tts(model, text, CONFIG, p):
    t_1 = time.time()
    # run the TTS model to get the mel spectrogram and alignment
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        backend='tf')
    # run the vocoder on the postnet spectrogram to get the waveform
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    waveform = waveform.numpy()[0, 0]
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform
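The real-time factor printed above divides wall-clock synthesis time by the duration of the generated audio, so values below 1.0 mean faster-than-real-time synthesis. A worked example with made-up numbers (22050 Hz matches the LJSpeech models used here):
In [ ]:
# illustrative numbers only, to show how the RTF above is computed
run_time = 2.5           # seconds spent synthesizing
num_samples = 110250     # length of the generated waveform
sample_rate = 22050      # LJSpeech audio sample rate
rtf = run_time / (num_samples / sample_rate)  # 2.5 / 5.0 = 0.5, i.e. 2x faster than real time
print(rtf)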
Load Models¶
In [ ]:
import os
import torch
import time
import IPython

from mozilla_voice_tts.tts.tf.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.tf.utils.io import load_checkpoint
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.utils.synthesis import synthesis
In [ ]:
# runtime settings
use_cuda = False
In [ ]:
# model paths
TTS_MODEL = "data/tts_model.pkl"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.pkl"
VOCODER_CONFIG = "data/config_vocoder.json"
In [ ]:
# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)
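If you want to confirm the configs loaded correctly, you can peek at the fields the later cells rely on:
In [ ]:
# optional: inspect the config values used later in this notebook
print(TTS_CONFIG.audio['sample_rate'])
print(TTS_CONFIG.use_phonemes)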
In [ ]:
# load the audio processor
TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'
ap = AudioProcessor(**TTS_CONFIG.audio)
In [ ]:
# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)
model.build_inference()
model = load_checkpoint(model, TTS_MODEL)
model.decoder.set_max_decoder_steps(1000)
In [ ]:
from mozilla_voice_tts.vocoder.tf.utils.generic_utils import setup_generator
from mozilla_voice_tts.vocoder.tf.utils.io import load_checkpoint

# LOAD VOCODER MODEL
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.build_inference()
vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)
vocoder_model.inference_padding = 0

ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
Run Inference¶
In [ ]:
sentence = "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)
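To keep the result, you can write the waveform to disk with the AudioProcessor loaded earlier (assuming its save_wav helper, as provided by mozilla_voice_tts's AudioProcessor):
In [ ]:
# save the synthesized audio; save_wav is assumed from
# mozilla_voice_tts.utils.audio.AudioProcessor
ap.save_wav(wav, "data/example_output.wav")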