mirror of https://github.com/coqui-ai/TTS.git
Add support for synthesis using a variable-size external speaker embedding; also fix a bug in the scipy.io import
This commit is contained in:
parent: def7e49f59
commit: 6e7f33c798
@@ -18,9 +18,9 @@ from mozilla_voice_tts.utils.io import load_config
 from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator
 
 
-def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
+def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):
     t_1 = time.time()
-    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, CONFIG.gst['gst_style_input'], False, CONFIG.enable_eos_bos_chars, use_gl)
+    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
     if CONFIG.model == "Tacotron" and not use_gl:
         mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
     if not use_gl:
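The tts() helper above now selects the speaker either by a file id or by an externally computed embedding of arbitrary dimension. A minimal stubbed sketch of the new interface (the stub body is illustrative only; the real model, vocoder, and audio processor come from setup_model, setup_generator, and the audio processor elsewhere in the repo):

def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl,
        speaker_fileid, speaker_embedding=None):
    # the fixed integer speaker_id is replaced by an optional,
    # variable-size embedding vector; either one (or neither) may be given
    mode = "external embedding" if speaker_embedding is not None else "speaker file id"
    print("synthesizing '{}' via {}".format(text, mode))

tts(None, None, "Hello.", None, False, None, True,
    None, speaker_embedding=[0.1] * 256)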
@@ -80,9 +80,9 @@ if __name__ == "__main__":
         help="JSON file for multi-speaker model.",
         default="")
     parser.add_argument(
-        '--speaker_id',
-        type=int,
-        help="target speaker_id if the model is multi-speaker.",
+        '--speaker_fileid',
+        type=str,
+        help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
         default=None)
     args = parser.parse_args()
 
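Standalone sketch of the reworked CLI flag: --speaker_fileid stays a free-form string so it can carry either a speakers.json key or a numeric speaker index (the parser and sample value below are illustrative, not the project's full CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--speaker_fileid',
    type=str,
    default=None,
    help="speakers.json key when an external embedding file is used, "
         "otherwise a numeric speaker id.")
args = parser.parse_args(['--speaker_fileid', 'p225_001.wav'])
print(args.speaker_fileid)  # -> p225_001.wav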
@@ -97,16 +97,24 @@ if __name__ == "__main__":
     if 'characters' in C.keys():
         symbols, phonemes = make_symbols(**C.characters)
 
+    speaker_embedding = None
+    speaker_embedding_dim = None
+    num_speakers = 0
+
     # load speakers
     if args.speakers_json != '':
-        speakers = json.load(open(args.speakers_json, 'r'))
-        num_speakers = len(speakers)
-    else:
-        num_speakers = 0
+        speaker_mapping = json.load(open(args.speakers_json, 'r'))
+        num_speakers = len(speaker_mapping)
+        if C.use_external_speaker_embedding_file:
+            if args.speaker_fileid is not None:
+                speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
+            else:  # if speaker_fileid is not specified, use the first sample in speakers.json
+                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
+            speaker_embedding_dim = len(speaker_embedding)
 
     # load the model
     num_chars = len(phonemes) if C.use_phonemes else len(symbols)
-    model = setup_model(num_chars, num_speakers, C)
+    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
     cp = torch.load(args.model_path, map_location=torch.device('cpu'))
     model.load_state_dict(cp['model'])
     model.eval()
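The embedding dimension is no longer hard-coded: it is measured from whatever vector is stored in speakers.json and forwarded to setup_model. A self-contained sketch of the selection logic above, with a made-up two-entry mapping (real files are produced by the speaker-encoder extraction notebooks):

speaker_mapping = {
    "p225_001.wav": {"name": "p225", "embedding": [0.1] * 256},
    "p226_002.wav": {"name": "p226", "embedding": [0.2] * 256},
}

speaker_fileid = None  # as if --speaker_fileid was omitted
if speaker_fileid is not None:
    speaker_embedding = speaker_mapping[speaker_fileid]['embedding']
else:
    # fall back to the first sample in the mapping
    speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']

speaker_embedding_dim = len(speaker_embedding)
print(speaker_embedding_dim)  # 256 here, but any size works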
@@ -130,7 +138,16 @@ if __name__ == "__main__":
     # synthesize voice
     use_griffin_lim = args.vocoder_path == ""
     print(" > Text: {}".format(args.text))
-    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_id)
+
+    if not C.use_external_speaker_embedding_file:
+        if args.speaker_fileid.isdigit():
+            args.speaker_fileid = int(args.speaker_fileid)
+        else:
+            args.speaker_fileid = None
+    else:
+        args.speaker_fileid = None
+
+    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding)
 
     # save the results
     file_name = args.text.replace(" ", "_")
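One caveat in the branch above: args.speaker_fileid.isdigit() is called without a None check, so omitting --speaker_fileid with a plain multi-speaker model would raise AttributeError. A guarded standalone version of the same normalization (function name is illustrative):

def normalize_speaker_fileid(fileid, use_external_embedding_file):
    if use_external_embedding_file:
        return None  # the embedding itself selects the speaker
    if fileid is not None and fileid.isdigit():
        return int(fileid)  # plain multi-speaker models expect an int id
    return None

print(normalize_speaker_fileid("3", False))            # -> 3
print(normalize_speaker_fileid("p225_001.wav", True))  # -> None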
@@ -523,7 +523,6 @@ def main(args): # pylint: disable=redefined-outer-name
             "a previously trained model."
     elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file
         speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
-        print(speaker_mapping)
         speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
     elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file
         raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
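Note that the raise statement kept as context above throws a bare string, which Python 3 rejects with "TypeError: exceptions must derive from BaseException". A valid standalone form of the same guard might look like this (function name and message wording are illustrative):

def check_external_embedding_config(use_external_file, external_file):
    if use_external_file and not external_file:
        raise RuntimeError(
            "use_external_speaker_embedding_file is True, so you need to pass "
            "an external speaker embedding file; run one of the "
            "ExtractSpeakerEmbeddings-by-sample notebooks in notebooks/.")

check_external_embedding_config(True, "speakers.json")  # passes silently
# check_external_embedding_config(True, "")  # would raise RuntimeError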
@@ -45,17 +45,17 @@ def compute_style_mel(style_wav, ap, cuda=False):
     return style_mel
 
 
-def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
+def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None):
     if CONFIG.use_gst:
         decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-            inputs, style_mel=style_mel, speaker_ids=speaker_id)
+            inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
     else:
         if truncated:
             decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
-                inputs, speaker_ids=speaker_id)
+                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
         else:
             decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-                inputs, speaker_ids=speaker_id)
+                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
     return decoder_output, postnet_output, alignments, stop_tokens
 
 
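run_model_torch now threads speaker_embeddings through all three inference paths (GST, truncated, plain), and passing None preserves the old behavior for models that ignore it. A toy stand-in showing the forwarding pattern (ToyModel is hypothetical, not a class from the repo):

class ToyModel:
    def inference(self, inputs, style_mel=None, speaker_ids=None,
                  speaker_embeddings=None):
        used = "embedding" if speaker_embeddings is not None else "id or none"
        return "decoded {} with {}".format(inputs, used)

model = ToyModel()
print(model.inference("hello"))                            # id or none
print(model.inference("hello", speaker_embeddings=[0.1]))  # embedding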
@@ -140,6 +140,15 @@ def id_to_torch(speaker_id, cuda=False):
     return speaker_id
 
 
+def embedding_to_torch(speaker_embedding, cuda=False):
+    if speaker_embedding is not None:
+        speaker_embedding = np.asarray(speaker_embedding)
+        speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
+    if cuda:
+        return speaker_embedding.cuda()
+    return speaker_embedding
+
+
 # TODO: perform GL with pytorch for batching
 def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
     '''Apply griffin-lim to each sample iterating throught the first dimension.
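What the new embedding_to_torch does to a plain Python list, shown standalone (assumes numpy and torch are installed): a length-N embedding becomes a [1, N] float tensor, i.e. a batch of one, regardless of N.

import numpy as np
import torch

speaker_embedding = [0.1] * 256  # any dimension works
tensor = torch.from_numpy(np.asarray(speaker_embedding)).unsqueeze(0).type(torch.FloatTensor)
print(tensor.shape, tensor.dtype)  # torch.Size([1, 256]) torch.float32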
@@ -169,6 +178,7 @@ def synthesis(model,
               enable_eos_bos_chars=False, #pylint: disable=unused-argument
               use_griffin_lim=False,
               do_trim_silence=False,
+              speaker_embedding=None,
               backend='torch'):
     """Synthesize voice for the given text.
 
@@ -200,6 +210,10 @@ def synthesis(model,
     if backend == 'torch':
         if speaker_id is not None:
             speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
+
+        if speaker_embedding is not None:
+            speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda)
+
         if not isinstance(style_mel, dict):
             style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
         inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
@@ -216,7 +230,7 @@ def synthesis(model,
     # synthesize voice
     if backend == 'torch':
         decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
-            model, inputs, CONFIG, truncated, speaker_id, style_mel)
+            model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding)
         postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
             postnet_output, decoder_output, alignments, stop_tokens)
     elif backend == 'tf':