mirror of https://github.com/coqui-ai/TTS.git
config, benchmark notebook, synthesis fixed
This commit is contained in:
parent
d172a3d3d5
commit
05ff8801d1
|
@ -75,6 +75,7 @@
|
||||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||||
"text_cleaner": "phoneme_cleaners"
|
"text_cleaner": "phoneme_cleaners",
|
||||||
|
"num_speakers": 10 // should just be bigger than the actual number of speakers
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,6 +76,7 @@
|
||||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||||
"text_cleaner": "phoneme_cleaners"
|
"text_cleaner": "phoneme_cleaners",
|
||||||
|
"num_speakers": 10 // should just be bigger than the actual number of speakers
|
||||||
}
|
}
|
||||||
|
|
|
@ -78,6 +78,7 @@
|
||||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||||
"text_cleaner": "phoneme_cleaners"
|
"text_cleaner": "phoneme_cleaners",
|
||||||
|
"num_speakers": 10 // should just be bigger than the actual number of speakers
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,6 +76,7 @@
|
||||||
"phoneme_cache_path": "phoneme_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "phoneme_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||||
"phoneme_language": "de", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
"phoneme_language": "de", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||||
"text_cleaner": "phoneme_cleaners"
|
"text_cleaner": "phoneme_cleaners",
|
||||||
|
"num_speakers": 10 // should just be bigger than the actual number of speakers
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,6 +76,7 @@
|
||||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
||||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||||
"text_cleaner": "phoneme_cleaners"
|
"text_cleaner": "phoneme_cleaners",
|
||||||
|
"num_speakers": 10 // should just be bigger than the actual number of speakers
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -35,17 +35,17 @@ def compute_style_mel(style_wav, ap, use_cuda):
|
||||||
return style_mel
|
return style_mel
|
||||||
|
|
||||||
|
|
||||||
def run_model(model, inputs, CONFIG, truncated, style_mel=None):
|
def run_model(model, inputs, speaker_id, CONFIG, truncated, style_mel=None):
|
||||||
if CONFIG.model == "TacotronGST" and style_mel is not None:
|
if CONFIG.model == "TacotronGST" and style_mel is not None:
|
||||||
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
|
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
|
||||||
inputs, style_mel)
|
inputs, style_mel, speaker_id)
|
||||||
else:
|
else:
|
||||||
if truncated:
|
if truncated:
|
||||||
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
|
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
|
||||||
inputs)
|
inputs, speaker_id)
|
||||||
else:
|
else:
|
||||||
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
|
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
|
||||||
inputs)
|
inputs, speaker_id)
|
||||||
return decoder_output, postnet_output, alignments, stop_tokens
|
return decoder_output, postnet_output, alignments, stop_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@ -100,12 +100,13 @@ def synthesis(model,
|
||||||
style_mel = compute_style_mel(style_wav, ap, use_cuda)
|
style_mel = compute_style_mel(style_wav, ap, use_cuda)
|
||||||
# preprocess the given text
|
# preprocess the given text
|
||||||
inputs = text_to_seqvec(text, CONFIG, use_cuda)
|
inputs = text_to_seqvec(text, CONFIG, use_cuda)
|
||||||
speaker_id = speaker_id_var = torch.from_numpy(speaker_id).unsqueeze(0)
|
speaker_id = np.asarray(speaker_id)
|
||||||
|
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
speaker_id.cuda()
|
speaker_id.cuda()
|
||||||
# synthesize voice
|
# synthesize voice
|
||||||
decoder_output, postnet_output, alignments, stop_tokens = run_model(
|
decoder_output, postnet_output, alignments, stop_tokens = run_model(
|
||||||
model, inputs, CONFIG, truncated, style_mel)
|
model, inputs, speaker_id, CONFIG, truncated, style_mel)
|
||||||
# convert outputs to numpy
|
# convert outputs to numpy
|
||||||
postnet_output, decoder_output, alignment = parse_outputs(
|
postnet_output, decoder_output, alignment = parse_outputs(
|
||||||
postnet_output, decoder_output, alignments)
|
postnet_output, decoder_output, alignments)
|
||||||
|
|
Loading…
Reference in New Issue