add support for synthesis using variable-size external speaker embeddings and fix a bug in the scipy.io import

Edresson 2020-07-30 03:51:20 -03:00 committed by erogol
parent def7e49f59
commit 6e7f33c798
3 changed files with 47 additions and 17 deletions

File 1 of 3

@@ -18,9 +18,9 @@ from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):
t_1 = time.time()
waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, CONFIG.gst['gst_style_input'], False, CONFIG.enable_eos_bos_chars, use_gl)
waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
if CONFIG.model == "Tacotron" and not use_gl:
mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
if not use_gl:
@@ -80,9 +80,9 @@ if __name__ == "__main__":
help="JSON file for multi-speaker model.",
default="")
parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
'--speaker_fileid',
type=str,
help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
default=None)
args = parser.parse_args()
@@ -97,16 +97,24 @@ if __name__ == "__main__":
if 'characters' in C.keys():
symbols, phonemes = make_symbols(**C.characters)
speaker_embedding = None
speaker_embedding_dim = None
num_speakers = 0
# load speakers
if args.speakers_json != '':
speakers = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speakers)
else:
num_speakers = 0
speaker_mapping = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speaker_mapping)
if C.use_external_speaker_embedding_file:
if args.speaker_fileid is not None:
speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
else: # if speaker_fileid is not specified, use the first sample in speakers.json
speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
speaker_embedding_dim = len(speaker_embedding)
# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
cp = torch.load(args.model_path, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])
model.eval()
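Note: the loading logic above assumes a speakers.json file that maps each reference audio file name to a dict carrying an 'embedding' list, as produced by the speaker-encoder extraction notebooks. A minimal sketch of reading one embedding under that assumption (the layout in the comment is inferred from the speaker_mapping lookups in this diff; the key name is hypothetical):

import json

# Assumed speakers.json layout, inferred from speaker_mapping[...]['embedding'] above:
# { "p225_001.wav": { "embedding": [0.012, -0.034, ...] }, ... }
with open("speakers.json", "r") as f:
    speaker_mapping = json.load(f)

first_key = list(speaker_mapping.keys())[0]
speaker_embedding = speaker_mapping[first_key]["embedding"]
speaker_embedding_dim = len(speaker_embedding)  # variable size: 256-d, 512-d, etc. all work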
@@ -130,7 +138,16 @@ if __name__ == "__main__":
# synthesize voice
use_griffin_lim = args.vocoder_path == ""
print(" > Text: {}".format(args.text))
wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_id)
if not C.use_external_speaker_embedding_file:
if args.speaker_fileid is not None and args.speaker_fileid.isdigit():
args.speaker_fileid = int(args.speaker_fileid)
else:
args.speaker_fileid = None
else:
args.speaker_fileid = None
wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding)
# save the results
file_name = args.text.replace(" ", "_")
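With the new flag, a reference embedding is selected by its key in speakers.json instead of an integer speaker id. A hypothetical invocation (the positional arguments and file names are placeholders, not taken from this commit; only --speakers_json and --speaker_fileid come from the code above):

python synthesize.py "Hello world." config.json checkpoint.pth.tar out/ \
    --speakers_json speakers.json \
    --speaker_fileid p225_001.wav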

File 2 of 3

@@ -523,7 +523,6 @@ def main(args): # pylint: disable=redefined-outer-name
"a previously trained model."
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # starting a new training run using an external embedding file
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
print(speaker_mapping)
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # external embeddings enabled but no embedding file given
raise ValueError("use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder")

File 3 of 3

@@ -45,17 +45,17 @@ def compute_style_mel(style_wav, ap, cuda=False):
return style_mel
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None):
if CONFIG.use_gst:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, style_mel=style_mel, speaker_ids=speaker_id)
inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
else:
if truncated:
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
inputs, speaker_ids=speaker_id)
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
else:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, speaker_ids=speaker_id)
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
return decoder_output, postnet_output, alignments, stop_tokens
@@ -140,6 +140,15 @@ def id_to_torch(speaker_id, cuda=False):
return speaker_id
def embedding_to_torch(speaker_embedding, cuda=False):
if speaker_embedding is not None:
speaker_embedding = np.asarray(speaker_embedding)
speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
if cuda:
return speaker_embedding.cuda()
return speaker_embedding
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
'''Apply griffin-lim to each sample, iterating through the first dimension.
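For reference, embedding_to_torch above mirrors id_to_torch: it wraps a plain Python list in a batched float32 tensor. A quick usage sketch (the helper is repeated so the snippet runs standalone; the 256-d size is only an example):

import numpy as np
import torch

def embedding_to_torch(speaker_embedding, cuda=False):
    # copy of the helper added above
    if speaker_embedding is not None:
        speaker_embedding = np.asarray(speaker_embedding)
        speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
    if cuda:
        return speaker_embedding.cuda()
    return speaker_embedding

emb = embedding_to_torch([0.01] * 256)  # e.g. an embedding list read from speakers.json
print(emb.shape)  # torch.Size([1, 256]); unsqueeze(0) adds the batch dimension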
@@ -169,6 +178,7 @@ def synthesis(model,
enable_eos_bos_chars=False, #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
speaker_embedding=None,
backend='torch'):
"""Synthesize voice for the given text.
@@ -200,6 +210,10 @@
if backend == 'torch':
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
if speaker_embedding is not None:
speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda)
if not isinstance(style_mel, dict):
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
@@ -216,7 +230,7 @@
# synthesize voice
if backend == 'torch':
decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
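Taken together, the torch backend now threads the embedding from the CLI all the way into model.inference. A runnable sketch of that final hand-off with a stub model (the stub and its output shapes are illustrative only; the real models live in mozilla_voice_tts):

import torch

class StubModel:
    # illustrative stand-in: only the keyword signature mirrors the diff above
    def inference(self, inputs, speaker_ids=None, speaker_embeddings=None, style_mel=None):
        batch, frames, n_mels = inputs.shape[0], 10, 80
        mel = torch.zeros(batch, frames, n_mels)
        return mel, mel.clone(), torch.zeros(batch, frames, inputs.shape[1]), torch.zeros(batch, frames)

model = StubModel()
inputs = torch.ones(1, 5, dtype=torch.long)    # token ids for one sentence
speaker_embedding = torch.rand(1, 256)         # as produced by embedding_to_torch
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
    inputs, speaker_ids=None, speaker_embeddings=speaker_embedding)
print(postnet_output.shape)  # torch.Size([1, 10, 80])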