Update models

This commit is contained in:
Eren Gölge 2022-03-03 20:23:00 +01:00
parent 6cb00be795
commit dd4287de1f
4 changed files with 12 additions and 18 deletions

View File

@ -33,7 +33,7 @@
},
"tacotron2-DDC_ph": {
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
"commit": "3900448",
"author": "Eren Gölge @erogol",
@ -71,7 +71,7 @@
},
"vits": {
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren Gölge @erogol",
@ -89,18 +89,9 @@
}
},
"vctk": {
"sc-glow-tts": {
"description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
"default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
"commit": "b531fa69",
"author": "Edresson Casanova",
"license": "",
"contact": ""
},
"vits": {
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--vctk--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren @erogol",
@ -109,7 +100,7 @@
},
"fast_pitch":{
"description": "FastPitch model trained on VCTK dataset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
"commit": "bdab788d",
"author": "Eren @erogol",
@ -156,7 +147,7 @@
"uk":{
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--uk--mailabs--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"commit": "bdab788d",
"license": "MIT",
@ -168,7 +159,7 @@
"zh-CN": {
"baker": {
"tacotron2-DDC-GST": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"commit": "unknown",
"author": "@kirianguiller",
"default_vocoder": null

View File

@ -1470,7 +1470,7 @@ class Vits(BaseTTS):
"""
from TTS.utils.audio import AudioProcessor
upsample_rate = math.prod(config.model_args.upsample_rates_decoder)
upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
assert (
upsample_rate == config.audio.hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
@ -1480,7 +1480,7 @@ class Vits(BaseTTS):
speaker_manager = SpeakerManager.init_from_config(config, samples)
language_manager = LanguageManager.init_from_config(config)
if config.model_args.speaker_encoder_model_path is not None:
if config.model_args.speaker_encoder_model_path:
speaker_manager.init_speaker_encoder(
config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
)

View File

@ -167,7 +167,7 @@ def synthesis(
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
# convert text to sequence of token IDs
text_inputs = np.asarray(
model.tokenizer.text_to_ids(text),
model.tokenizer.text_to_ids(text, language=language_id),
dtype=np.int32,
)
# pass tensors to backend

View File

@ -93,6 +93,9 @@ class TTSTokenizer:
language(str):
The language code of the text. Defaults to None.
TODO:
- Add support for language-specific processing.
1. Text normalization
2. Phonemization (if use_phonemes is True)
3. Add blank char between characters