mirror of https://github.com/coqui-ai/TTS.git

Update models

This commit is contained in:
parent 6cb00be795
commit dd4287de1f
TTS/.models.json

@@ -33,7 +33,7 @@
                 },
                 "tacotron2-DDC_ph": {
                     "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
                     "default_vocoder": "vocoder_models/en/ljspeech/univnet",
                     "commit": "3900448",
                     "author": "Eren Gölge @erogol",
@@ -71,7 +71,7 @@
                 },
                 "vits": {
                     "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
                     "default_vocoder": null,
                     "commit": "3900448",
                     "author": "Eren Gölge @erogol",
@@ -89,18 +89,9 @@
                 }
             },
             "vctk": {
-                "sc-glow-tts": {
-                    "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
-                    "default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
-                    "commit": "b531fa69",
-                    "author": "Edresson Casanova",
-                    "license": "",
-                    "contact": ""
-                },
                 "vits": {
                     "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--vctk--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
                     "default_vocoder": null,
                     "commit": "3900448",
                     "author": "Eren @erogol",
@@ -109,7 +100,7 @@
                 },
                 "fast_pitch":{
                     "description": "FastPitch model trained on VCTK dataseset.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
                     "default_vocoder": null,
                     "commit": "bdab788d",
                     "author": "Eren @erogol",
@@ -156,7 +147,7 @@
         "uk":{
             "mai": {
                 "glow-tts": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--uk--mailabs--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
                     "author":"@robinhad",
                     "commit": "bdab788d",
                     "license": "MIT",
@@ -168,7 +159,7 @@
         "zh-CN": {
             "baker": {
                 "tacotron2-DDC-GST": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
                     "commit": "unknown",
                     "author": "@kirianguiller",
                     "default_vocoder": null
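All of the `.models.json` hunks above do the same thing: point `github_rls_url` at the consolidated `v0.6.0_models` release path (and remove the VCTK `sc-glow-tts` entry). The file keeps a nested `model_type / language / dataset / model_name` layout, so a model identifier such as `tts_models/en/ljspeech/vits` is simply a path into the dictionary. A minimal sketch of that lookup, with one entry from the diff inlined; `lookup_model` is an illustrative helper, not the library's `ModelManager` API:

```python
# One entry from the diff above, inlined so the demo is self-contained.
models = {
    "tts_models": {
        "en": {
            "ljspeech": {
                "vits": {
                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
                    "default_vocoder": None,
                    "commit": "3900448",
                }
            }
        }
    }
}

def lookup_model(models: dict, model_name: str) -> dict:
    """Resolve a 'type/language/dataset/name' identifier to its entry."""
    model_type, lang, dataset, name = model_name.split("/")
    return models[model_type][lang][dataset][name]

print(lookup_model(models, "tts_models/en/ljspeech/vits")["github_rls_url"])
```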
TTS/tts/models/vits.py

@@ -1470,7 +1470,7 @@ class Vits(BaseTTS):
         """
         from TTS.utils.audio import AudioProcessor

-        upsample_rate = math.prod(config.model_args.upsample_rates_decoder)
+        upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (
             upsample_rate == config.audio.hop_length
         ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
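The `math.prod` to `torch.prod` swap reads like a compatibility fix: `math.prod` only exists on Python 3.8+, while `torch.prod` over a tensor works on older interpreters too. Both forms compute the product that the assert compares against the hop length; a quick check with assumed example values (not taken from a shipped config):

```python
import math

import torch

upsample_rates_decoder = [8, 8, 2, 2]  # assumed example rates
hop_length = 256                       # assumed matching hop length

# The two expressions are interchangeable here; only the Python
# version requirement differs (math.prod needs >= 3.8).
assert math.prod(upsample_rates_decoder) == hop_length
assert torch.prod(torch.as_tensor(upsample_rates_decoder)).item() == hop_length
```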
@@ -1480,7 +1480,7 @@ class Vits(BaseTTS):
         speaker_manager = SpeakerManager.init_from_config(config, samples)
         language_manager = LanguageManager.init_from_config(config)

-        if config.model_args.speaker_encoder_model_path is not None:
+        if config.model_args.speaker_encoder_model_path:
             speaker_manager.init_speaker_encoder(
                 config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
             )
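Dropping `is not None` in favor of a plain truthiness test also rejects the empty string, which configs commonly carry for an unset path, so the speaker encoder is only initialized when a usable path is present. A small illustration:

```python
# The old and new guards differ only for "": configs may store an
# unset path as None or as an empty string.
for path in (None, "", "speaker_encoder.pt"):  # last value is an illustrative path
    old_guard = path is not None  # True for "" -> would try to load from an empty path
    new_guard = bool(path)        # True only for a real, non-empty path
    print(repr(path), old_guard, new_guard)
```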
TTS/tts/utils/synthesis.py

@@ -167,7 +167,7 @@ def synthesis(
         style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
     # convert text to sequence of token IDs
     text_inputs = np.asarray(
-        model.tokenizer.text_to_ids(text),
+        model.tokenizer.text_to_ids(text, language=language_id),
         dtype=np.int32,
     )
     # pass tensors to backend
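Threading `language=language_id` into `text_to_ids` matters once a tokenizer behaves differently per language, for example when it picks a language-specific phonemizer. A toy stand-in below makes the point; the tables and function are illustrative, not the library's tokenizer:

```python
# Toy multilingual tokenizer: the same text maps to different ID
# sequences depending on the language-specific table.
TABLES = {"en": {"a": 1, "b": 2}, "fr": {"a": 10, "b": 20}}  # assumed toy data

def text_to_ids(text, language=None):
    table = TABLES[language or "en"]
    return [table[ch] for ch in text if ch in table]

print(text_to_ids("ab"))                 # [1, 2]   - old call shape, implicit default
print(text_to_ids("ab", language="fr"))  # [10, 20] - new call shape, language-aware
```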
TTS/tts/utils/text/tokenizer.py

@@ -93,6 +93,9 @@ class TTSTokenizer:
             language(str):
                 The language code of the text. Defaults to None.

+        TODO:
+            - Add support for language-specific processing.
+
         1. Text normalizatin
         2. Phonemization (if use_phonemes is True)
         3. Add blank char between characters
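For reference, a rough sketch of the three steps the docstring lists (normalize, phonemize, intersperse blanks); every name here is illustrative, not the class's real internals:

```python
# Sketch of the documented pipeline: normalize -> phonemize -> blanks.
def text_to_ids_sketch(text, normalize, phonemize, char_to_id, use_phonemes=True, blank_id=0):
    text = normalize(text)          # 1. text normalization
    if use_phonemes:
        text = phonemize(text)      # 2. phonemization
    ids = [char_to_id[ch] for ch in text]
    out = [blank_id]                # 3. add a blank token between characters
    for i in ids:
        out += [i, blank_id]
    return out

ids = text_to_ids_sketch(
    "ABC",
    normalize=str.lower,
    phonemize=lambda s: s,  # identity stand-in for a real phonemizer
    char_to_id={"a": 1, "b": 2, "c": 3},
)
print(ids)  # [0, 1, 0, 2, 0, 3, 0]
```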