.models.json and synthesize.py update for interfacing with model manager

root 2021-01-20 02:08:58 +00:00
parent 435943ba39
commit 3d30dae8f3
2 changed files with 334 additions and 106 deletions

.models.json (new file, 77 lines)

@@ -0,0 +1,77 @@
{
    "tts_models": {
        "en": {
            "ljspeech": {
                "glow-tts": {
                    "description": "",
                    "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n",
                    "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t",
                    "stats_file": null,
                    "commit": ""
                },
                "tacotron2-DCA": {
                    "description": "",
                    "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7",
                    "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1",
                    "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK",
                    "commit": ""
                },
                "speedy-speech-wn": {
                    "description": "Speedy Speech model with wavenet decoder.",
                    "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ",
                    "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3",
                    "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR",
                    "commit": "77b6145"
                }
            }
        },
        "es": {
            "mai": {
                "tacotron2-DDC": {
                    "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw",
                    "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        },
        "fr": {
            "mai": {
                "tacotron2-DDC": {
                    "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS",
                    "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        }
    },
    "vocoder_models": {
        "universal": {
            "libri-tts": {
                "wavegrad": {
                    "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6",
                    "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s",
                    "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0",
                    "commit": "ea976b0"
                },
                "fullband-melgan": {
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "4132240"
                }
            }
        },
        "en": {
            "ljspeech": {
                "mulitband-melgan": {
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "ea976b0"
                }
            }
        }
    }
}
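
A note on how synthesize.py consumes this file: a model is addressed by walking model type, language, dataset and model name down the nested keys, and ModelManager.download_model() (used in the diff below) resolves the stored Google Drive IDs to local files. The following is a minimal, standalone sketch of the lookup step only; the type-prefixed name format "tts_models/en/ljspeech/glow-tts" and the lookup() helper are illustrative assumptions, and the download itself is not reproduced here.

    import json

    def lookup(models_json_path, model_name):
        # model_name is assumed to be "<type>/<language>/<dataset>/<name>",
        # e.g. "tts_models/en/ljspeech/glow-tts"; the exact format accepted
        # by ModelManager may differ (the CLI help describes it as
        # <language>/<dataset>/<model_name>).
        with open(models_json_path, 'r') as f:
            models = json.load(f)
        model_type, lang, dataset, name = model_name.split('/')
        entry = models[model_type][lang][dataset][name]
        # each leaf stores the Google Drive IDs of checkpoint, config and stats
        return entry['model_file'], entry['config_file'], entry['stats_file']

    # example, assuming a local copy of the file above
    print(lookup('.models.json', 'tts_models/en/ljspeech/glow-tts'))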

TTS/bin/synthesize.py (363 lines, Normal file → Executable file) — updated content:

@@ -3,50 +3,140 @@
import argparse
import json
# pylint: disable=redefined-outer-name, unused-argument
import os
import sys
import string
import time
from argparse import RawTextHelpFormatter
from pathlib import Path

import numpy as np
import torch

from TTS.tts.utils.generic_utils import is_tacotron, setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.io import load_checkpoint
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.utils.manage import ModelManager
from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def load_tts_model(model_path, config_path, use_cuda, speakers_json=None, speaker_idx=None):
    global phonemes
    global symbols

    # load the config
    model_config = load_config(config_path)

    # load the audio processor
    ap = AudioProcessor(**model_config.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in model_config.keys():
        symbols, phonemes = make_symbols(**model_config.characters)

    # load speakers
    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0
    if speakers_json is not None:
        speaker_mapping = json.load(open(speakers_json, 'r'))
        num_speakers = len(speaker_mapping)
        if model_config.use_external_speaker_embedding_file:
            if speaker_idx is not None:
                speaker_embedding = speaker_mapping[speaker_idx]['embedding']
            else:  # if speaker_idx is not specified, use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
            speaker_embedding_dim = len(speaker_embedding)

    # load tts model
    num_chars = len(phonemes) if model_config.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, model_config, speaker_embedding_dim)
    model.load_checkpoint(model_config, model_path, eval=True)
    if use_cuda:
        model.cuda()
    return model, model_config, ap, speaker_embedding


def load_vocoder_model(model_path, config_path, use_cuda):
    vocoder_config = load_config(config_path)
    vocoder_ap = AudioProcessor(**vocoder_config['audio'])
    vocoder_model = setup_generator(vocoder_config)
    vocoder_model.load_checkpoint(vocoder_config, model_path, eval=True)
    if use_cuda:
        vocoder_model.cuda()
    return vocoder_model, vocoder_config, vocoder_ap


def tts(model,
        vocoder_model,
        text,
        model_config,
        vocoder_config,
        use_cuda,
        ap,
        vocoder_ap,
        use_gl,
        speaker_fileid,
        speaker_embedding=None,
        gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(
        model,
        text,
        model_config,
        use_cuda,
        ap,
        speaker_fileid,
        gst_style,
        False,
        model_config.enable_eos_bos_chars,
        use_gl,
        speaker_embedding=speaker_embedding)

    # grab spectrogram (thx to the nice guys at mozilla discourse for the code snippet)
    if args.save_spectogram:
        spec_file_name = args.text.replace(" ", "_")[0:10]
        spec_file_name = spec_file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.npy'
        spec_file_name = os.path.join(args.out_path, spec_file_name)
        spectrogram = mel_postnet_spec.T
        spectrogram = spectrogram[0]
        np.save(spec_file_name, spectrogram)
        print(" > Saving raw spectrogram to " + spec_file_name)

    # convert linear spectrogram to melspectrogram for tacotron
    if model_config.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T)

    # run vocoder_model
    if not use_gl:
        # denormalize tts output based on tts audio config
        mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T
        device_type = "cuda" if use_cuda else "cpu"
        # renormalize spectrogram based on vocoder config
        vocoder_input = vocoder_ap._normalize(mel_postnet_spec.T)
        # compute scale factor for possible sample rate mismatch
        scale_factor = [1, vocoder_config['audio']['sample_rate'] / ap.sample_rate]
        if scale_factor[1] != 1:
            print(" > interpolating tts model output.")
            vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
        else:
            vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)
        # run vocoder model
        # [1, T, C]
        waveform = vocoder_model.inference(vocoder_input.to(device_type))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
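
Aside: the denormalize/renormalize/interpolate sequence in tts() above bridges a TTS model and a vocoder that were trained with different audio settings and sample rates. Below is a minimal sketch of the same idea in isolation, calling torch.nn.functional.interpolate directly; the statistics and sample rates are made up for illustration, and the real interpolate_vocoder_input helper (not shown in this diff) may differ.

    import torch
    import torch.nn.functional as F

    # hypothetical normalization stats and sample rates of the two audio configs
    tts_mean, tts_std = -4.0, 4.0
    voc_mean, voc_std = -5.0, 3.5
    tts_sr, voc_sr = 22050, 24000

    mel = torch.randn(80, 250)                 # [C, T] dummy normalized TTS output
    mel = mel * tts_std + tts_mean             # denormalize with the TTS stats
    mel = (mel - voc_mean) / voc_std           # renormalize with the vocoder stats

    scale_factor = [1.0, voc_sr / tts_sr]      # only the time axis is rescaled
    vocoder_input = mel.unsqueeze(0)           # [1, C, T]
    if scale_factor[1] != 1:
        # F.interpolate expects [N, C, H, W] for a 2D rescale, so add a batch dim
        vocoder_input = F.interpolate(vocoder_input.unsqueeze(0),
                                      scale_factor=scale_factor,
                                      mode='bilinear',
                                      align_corners=False).squeeze(0)
    print(vocoder_input.shape)                 # roughly [1, 80, 272] for these numbers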
@@ -62,54 +152,115 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='''Synthesize speech on command line.\n\n'''

        '''You can either use your trained model or choose a model from the provided list.\n'''

        '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run a model from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.'
    )
    parser.add_argument(
        '--text',
        type=str,
        default=None,
        help='Text to generate speech.'
    )

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default=None,
        help='Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help='Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument(
        '--config_path',
        default=None,
        type=str,
        help='Path to model config file.'
    )
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help='Path to save final wav file. Wav file will be named as the given text.',
    )
    parser.add_argument(
        '--use_cuda',
        type=bool,
        help='Run model on CUDA.',
        default=False
    )
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help='Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default=None,
    )
    parser.add_argument(
        '--vocoder_config_path',
        type=str,
        help='Path to vocoder model config file.',
        default=None)

    # args for multi-speaker synthesis
    parser.add_argument(
        '--speakers_json',
        type=str,
        help="JSON file for multi-speaker model.",
        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json; else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Wav path file for GST style reference.",
        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
@@ -118,86 +269,86 @@ if __name__ == "__main__":

    args = parser.parse_args()

    # load model manager
    path = Path(__file__).parent / "../../.models.json"
    manager = ModelManager(path)

    model_path = None
    vocoder_path = None
    model = None
    vocoder_model = None
    vocoder_config = None
    vocoder_ap = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path = manager.download_model(args.model_name)

    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    model, model_config, ap, speaker_embedding = load_tts_model(
        model_path, config_path, args.use_cuda, args.speakers_json, args.speaker_idx)
    if vocoder_path is not None:
        vocoder_model, vocoder_config, vocoder_ap = load_vocoder_model(vocoder_path, vocoder_config_path, use_cuda=args.use_cuda)

    use_griffin_lim = vocoder_path is None
    print(" > Text: {}".format(args.text))

    # handle multi-speaker setting
    if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
        if args.speaker_idx.isdigit():
            args.speaker_idx = int(args.speaker_idx)
        else:
            args.speaker_idx = None
    else:
        args.speaker_idx = None

    if args.gst_style is None:
        if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
            gst_style = model_config.gst['gst_style_input']
        else:
            gst_style = None
    else:
        # check if gst_style string is a dict; if it is, convert it, else use the string
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
                raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style

    # kick it
    wav = tts(model,
              vocoder_model,
              args.text,
              model_config,
              vocoder_config,
              args.use_cuda,
              ap,
              vocoder_ap,
              use_griffin_lim,
              args.speaker_idx,
              speaker_embedding=speaker_embedding,
              gst_style=gst_style)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)