From 5ae369d6296959db652d29ac33c06218098ce07e Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Oct 2023 16:56:25 +0200 Subject: [PATCH 1/9] Move FreeVCConfig to TTS.vc.configs (like all other config classes) --- TTS/vc/configs/freevc_config.py | 277 ++++++++++++++++++++++++++++++- TTS/vc/models/freevc.py | 279 +------------------------------- 2 files changed, 278 insertions(+), 278 deletions(-) diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 890a2693..207181b3 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,278 @@ from dataclasses import dataclass, field -from typing import List +from typing import List, Optional + +from coqpit import Coqpit from TTS.vc.configs.shared_configs import BaseVCConfig -from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig + + +@dataclass +class FreeVCAudioConfig(Coqpit): + """Audio configuration + + Args: + max_wav_value (float): + The maximum value of the waveform. + + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + filter_length (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + + n_mel_channels (int): + The number of mel channels. + + mel_fmin (float): + The minimum frequency of the mel filterbank. + + mel_fmax (Optional[float]): + The maximum frequency of the mel filterbank. + """ + + max_wav_value: float = field(default=32768.0) + input_sample_rate: int = field(default=16000) + output_sample_rate: int = field(default=24000) + filter_length: int = field(default=1280) + hop_length: int = field(default=320) + win_length: int = field(default=1280) + n_mel_channels: int = field(default=80) + mel_fmin: float = field(default=0.0) + mel_fmax: Optional[float] = field(default=None) + + +@dataclass +class FreeVCArgs(Coqpit): + """FreeVC model arguments + + Args: + spec_channels (int): + The number of channels in the spectrogram. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers. + + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. + + ssl_dim (int): + The dimension of the self-supervised learning embedding. + + use_spk (bool): + Whether to use external speaker encoder. 
+ """ + + spec_channels: int = field(default=641) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + ssl_dim: int = field(default=1024) + use_spk: bool = field(default=False) + num_spks: int = field(default=0) + segment_size: int = field(default=8960) + + +@dataclass +class FreeVCConfig(BaseVCConfig): + """Defines parameters for FreeVC End2End TTS model. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (FreeVCArgs): + Model architecture arguments. Defaults to `FreeVCArgs()`. + + audio (FreeVCAudioConfig): + Audio processing configuration. Defaults to `FreeVCAudioConfig()`. + + grad_clip (List): + Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. + + lr_gen (float): + Initial learning rate for the generator. Defaults to 0.0002. + + lr_disc (float): + Initial learning rate for the discriminator. Defaults to 0.0002. + + lr_scheduler_gen (str): + Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_gen_params (dict): + Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + lr_scheduler_disc (str): + Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_disc_params (dict): + Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + scheduler_after_epoch (bool): + If true, step the schedulers after each epoch else after each step. Defaults to `False`. + + optimizer (str): + Name of the optimizer to use with both the generator and the discriminator networks. One of the + `torch.optim.*`. Defaults to `AdamW`. + + kl_loss_alpha (float): + Loss weight for KL loss. Defaults to 1.0. + + disc_loss_alpha (float): + Loss weight for the discriminator loss. Defaults to 1.0. + + gen_loss_alpha (float): + Loss weight for the generator loss. Defaults to 1.0. + + feat_loss_alpha (float): + Loss weight for the feature matching loss. Defaults to 1.0. + + mel_loss_alpha (float): + Loss weight for the mel loss. Defaults to 45.0. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. 
+
+        weighted_sampler_attrs (dict):
+            Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
+            by overweighting `root_path` by 2.0. Defaults to `{}`.
+
+        weighted_sampler_multipliers (dict):
+            Weight each unique value of a key returned by the formatter for weighted sampling.
+            For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}}`.
+            It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
+
+        r (int):
+            Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
+
+        add_blank (bool):
+            If true, a blank token is added in between every character. Defaults to `True`.
+
+        test_sentences (List[List]):
+            List of sentences with speaker and language information to be used for testing.
+
+        language_ids_file (str):
+            Path to the language ids file.
+
+        use_language_embedding (bool):
+            If true, language embedding is used. Defaults to `False`.
+
+    Note:
+        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+
+    Example:
+
+        >>> from TTS.vc.configs.freevc_config import FreeVCConfig
+        >>> config = FreeVCConfig()
+    """
+
+    model: str = "freevc"
+    # model specific params
+    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
+    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
+
+    # optimizer
+    # TODO with training support
+
+    # loss params
+    # TODO with training support
+
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+
+    def __post_init__(self):
+        for key, val in self.model_args.items():
+            if hasattr(self, key):
+                self[key] = val
diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py
index ae22ad28..fd53a77f 100644
--- a/TTS/vc/models/freevc.py
+++ b/TTS/vc/models/freevc.py
@@ -1,4 +1,3 @@
-from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple, Union
 
 import librosa
@@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 import TTS.vc.modules.freevc.commons as commons
 import TTS.vc.modules.freevc.modules as modules
 from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.io import load_fsspec, save_checkpoint
-from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.utils.io import load_fsspec
+from TTS.vc.configs.freevc_config import FreeVCConfig
 from TTS.vc.models.base_vc import BaseVC
 from TTS.vc.modules.freevc.commons import get_padding, init_weights
 from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
         return embed
 
 
-@dataclass
-class FreeVCAudioConfig(Coqpit):
-    """Audio configuration
-
-    Args:
-        max_wav_value (float):
-            The maximum value of the waveform.
- - input_sample_rate (int): - The sampling rate of the input waveform. - - output_sample_rate (int): - The sampling rate of the output waveform. - - filter_length (int): - The length of the filter. - - hop_length (int): - The hop length. - - win_length (int): - The window length. - - n_mel_channels (int): - The number of mel channels. - - mel_fmin (float): - The minimum frequency of the mel filterbank. - - mel_fmax (Optional[float]): - The maximum frequency of the mel filterbank. - """ - - max_wav_value: float = field(default=32768.0) - input_sample_rate: int = field(default=16000) - output_sample_rate: int = field(default=24000) - filter_length: int = field(default=1280) - hop_length: int = field(default=320) - win_length: int = field(default=1280) - n_mel_channels: int = field(default=80) - mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) - - -@dataclass -class FreeVCArgs(Coqpit): - """FreeVC model arguments - - Args: - spec_channels (int): - The number of channels in the spectrogram. - - inter_channels (int): - The number of channels in the intermediate layers. - - hidden_channels (int): - The number of channels in the hidden layers. - - filter_channels (int): - The number of channels in the filter layers. - - n_heads (int): - The number of attention heads. - - n_layers (int): - The number of layers. - - kernel_size (int): - The size of the kernel. - - p_dropout (float): - The dropout probability. - - resblock (str): - The type of residual block. - - resblock_kernel_sizes (List[int]): - The kernel sizes for the residual blocks. - - resblock_dilation_sizes (List[List[int]]): - The dilation sizes for the residual blocks. - - upsample_rates (List[int]): - The upsample rates. - - upsample_initial_channel (int): - The number of channels in the initial upsample layer. - - upsample_kernel_sizes (List[int]): - The kernel sizes for the upsample layers. - - n_layers_q (int): - The number of layers in the quantization network. - - use_spectral_norm (bool): - Whether to use spectral normalization. - - gin_channels (int): - The number of channels in the global conditioning vector. - - ssl_dim (int): - The dimension of the self-supervised learning embedding. - - use_spk (bool): - Whether to use external speaker encoder. - """ - - spec_channels: int = field(default=641) - inter_channels: int = field(default=192) - hidden_channels: int = field(default=192) - filter_channels: int = field(default=768) - n_heads: int = field(default=2) - n_layers: int = field(default=6) - kernel_size: int = field(default=3) - p_dropout: float = field(default=0.1) - resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) - upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - n_layers_q: int = field(default=3) - use_spectral_norm: bool = field(default=False) - gin_channels: int = field(default=256) - ssl_dim: int = field(default=1024) - use_spk: bool = field(default=False) - num_spks: int = field(default=0) - segment_size: int = field(default=8960) - - class FreeVC(BaseVC): """ @@ -677,7 +546,7 @@ class FreeVC(BaseVC): ... 
@staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True): model = FreeVC(config) return model @@ -689,145 +558,3 @@ class FreeVC(BaseVC): def train_step(): ... - - -@dataclass -class FreeVCConfig(BaseVCConfig): - """Defines parameters for FreeVC End2End TTS model. - - Args: - model (str): - Model name. Do not change unless you know what you are doing. - - model_args (FreeVCArgs): - Model architecture arguments. Defaults to `FreeVCArgs()`. - - audio (FreeVCAudioConfig): - Audio processing configuration. Defaults to `FreeVCAudioConfig()`. - - grad_clip (List): - Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. - - lr_gen (float): - Initial learning rate for the generator. Defaults to 0.0002. - - lr_disc (float): - Initial learning rate for the discriminator. Defaults to 0.0002. - - lr_scheduler_gen (str): - Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_gen_params (dict): - Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - lr_scheduler_disc (str): - Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_disc_params (dict): - Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - scheduler_after_epoch (bool): - If true, step the schedulers after each epoch else after each step. Defaults to `False`. - - optimizer (str): - Name of the optimizer to use with both the generator and the discriminator networks. One of the - `torch.optim.*`. Defaults to `AdamW`. - - kl_loss_alpha (float): - Loss weight for KL loss. Defaults to 1.0. - - disc_loss_alpha (float): - Loss weight for the discriminator loss. Defaults to 1.0. - - gen_loss_alpha (float): - Loss weight for the generator loss. Defaults to 1.0. - - feat_loss_alpha (float): - Loss weight for the feature matching loss. Defaults to 1.0. - - mel_loss_alpha (float): - Loss weight for the mel loss. Defaults to 45.0. - - return_wav (bool): - If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. - - compute_linear_spec (bool): - If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. - - use_weighted_sampler (bool): - If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. - - weighted_sampler_attrs (dict): - Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities - by overweighting `root_path` by 2.0. Defaults to `{}`. - - weighted_sampler_multipliers (dict): - Weight each unique value of a key returned by the formatter for weighted sampling. - For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. - It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. - - r (int): - Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. 
-
-        add_blank (bool):
-            If true, a blank token is added in between every character. Defaults to `True`.
-
-        test_sentences (List[List]):
-            List of sentences with speaker and language information to be used for testing.
-
-        language_ids_file (str):
-            Path to the language ids file.
-
-        use_language_embedding (bool):
-            If true, language embedding is used. Defaults to `False`.
-
-    Note:
-        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
-
-    Example:
-
-        >>> from TTS.tts.configs.freevc_config import FreeVCConfig
-        >>> config = FreeVCConfig()
-    """
-
-    model: str = "freevc"
-    # model specific params
-    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
-    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
-
-    # optimizer
-    # TODO with training support
-
-    # loss params
-    # TODO with training support
-
-    # data loader params
-    return_wav: bool = True
-    compute_linear_spec: bool = True
-
-    # sampler params
-    use_weighted_sampler: bool = False  # TODO: move it to the base config
-    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
-    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
-
-    # overrides
-    r: int = 1  # DO NOT CHANGE
-    add_blank: bool = True
-
-    # multi-speaker settings
-    # use speaker embedding layer
-    num_speakers: int = 0
-    speakers_file: str = None
-    speaker_embedding_channels: int = 256
-
-    # use d-vectors
-    use_d_vector_file: bool = False
-    d_vector_file: List[str] = None
-    d_vector_dim: int = None
-
-    def __post_init__(self):
-        for key, val in self.model_args.items():
-            if hasattr(self, key):
-                self[key] = val

From ce1a39a9a4d106c68320c3cb00954fbf69b17a87 Mon Sep 17 00:00:00 2001
From: Julian Weber
Date: Wed, 8 Nov 2023 10:24:23 +0100
Subject: [PATCH 2/9] Add char limit warn (#3130)

* Add char limit warning

* Adding v2 langs

* cached_property for cutlet

* Fix import
---
 TTS/tts/layers/xtts/tokenizer.py | 42 +++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 4f2da02d..4c7ae6e3 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -8,6 +8,7 @@ from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
 from tokenizers import Tokenizer
+from functools import cached_property
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
@@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file=None):
         self.tokenizer = None
-        self.katsu = None
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
+        self.char_limits = {
+            "en": 250,
+            "de": 253,
+            "fr": 273,
+            "es": 239,
+            "it": 213,
+            "pt": 203,
+            "pl": 224,
+            "zh-cn": 82,
+            "ar": 166,
+            "cs": 186,
+            "ru": 182,
+            "nl": 251,
+            "tr": 226,
+            "ja": 71,
+            "hu": 224,
+            "ko": 95,
+        }
+
+    @cached_property
+    def katsu(self):
+        import cutlet
+        return cutlet.Cutlet()
+
+    def check_input_length(self, txt, lang):
+        limit = self.char_limits.get(lang, 250)
+        if len(txt) > limit:
+            print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")
+
+    def preprocess_text(self, txt, lang):
+        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
+            txt = multilingual_cleaners(txt, lang)
+            if lang == "zh-cn":
+                txt = chinese_transliterate(txt)
+        elif lang == "ja":
+            txt = japanese_cleaners(txt, self.katsu)
+        else:
+            raise NotImplementedError()
+        return txt
 
     def encode(self, txt, lang):
+        self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
         txt = f"[{lang}]{txt}"
         txt = txt.replace(" ", "[SPACE]")

From a24ebcd8a6be0a233cf3bb3dfd23916b276dd591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Nov 2023 10:51:23 +0100
Subject: [PATCH 3/9] Fix coqui api (#3168)

---
 README.md                | 10 ++------
 TTS/api.py               |  6 ++---
 TTS/bin/synthesize.py    |  4 +--
 TTS/cs_api.py            | 54 ++++++++++------------------------------
 docs/source/inference.md | 11 ++------
 5 files changed, 22 insertions(+), 63 deletions(-)

diff --git a/README.md b/README.md
index 1a9285eb..353db7cf 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 ## 🐸Coqui.ai News
 - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
 - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
-- 📣 ⓍTTS can now stream with <200ms latency. 
+- 📣 ⓍTTS can now stream with <200ms latency.
 - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
 # Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
 
 # V1 model
 models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
 # Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
 ```
 
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
diff --git a/TTS/api.py b/TTS/api.py
index 5d1fbb5a..c8600dcd 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -60,7 +60,7 @@ class TTS(nn.Module):
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
             cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
+                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
                 Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -275,7 +275,7 @@ class TTS(nn.Module): speaker_name (str, optional): Speaker name from Coqui Studio. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. emotion (str, optional): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available with "V1" model. Defaults to None. @@ -321,7 +321,7 @@ class TTS(nn.Module): Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ef41c8e1..ddfe35d2 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -227,7 +227,7 @@ def main(): parser.add_argument( "--cs_model", type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.", + help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", ) parser.add_argument( "--emotion", @@ -238,7 +238,7 @@ def main(): parser.add_argument( "--language", type=str, - help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.", + help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.", default=None, ) parser.add_argument( diff --git a/TTS/cs_api.py b/TTS/cs_api.py index 4a44b535..c45f9d08 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -43,7 +43,7 @@ class CS_API: Args: api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`. + model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. 
Example listing all available speakers: @@ -65,7 +65,7 @@ class CS_API: Example with multi-language model: >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS-multilang") + >>> tts = CS_API(model="XTTS") >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") """ @@ -78,16 +78,12 @@ class CS_API: "XTTS": { "list_speakers": "https://app.coqui.ai/api/v2/speakers", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", - }, - "XTTS-multilang": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", + "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", }, } - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"] + + SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] def __init__(self, api_token=None, model="XTTS"): self.api_token = api_token @@ -139,7 +135,7 @@ class CS_API: self._check_token() conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?per_page=100", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s) for s in json.loads(data)["result"]] @@ -148,7 +144,7 @@ class CS_API: """List custom voices created by the user.""" conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s, True) for s in json.loads(data)["result"]] @@ -197,14 +193,6 @@ class CS_API: } ) elif model == "XTTS": - payload.update( - { - "name": speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS-multilang": payload.update( { "name": speaker.name, @@ -226,13 +214,10 @@ class CS_API: assert language is None, "❗ language is not supported for V1 model." elif self.model == "XTTS": assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model." - elif self.model == "XTTS-multilang": - assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS-multilang model." + assert language is not None, "❗ Language is required for XTTS model." assert ( language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl" + ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." return text, speaker_name, speaker_id, emotion, speed, language def tts( @@ -255,7 +240,7 @@ class CS_API: supported by `V1` model. Defaults to None. speed (float): Speed of the speech. 1.0 is normal speed. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. 
""" self._check_token() self.ping_api() @@ -305,7 +290,7 @@ class CS_API: speed (float): Speed of the speech. 1.0 is normal speed. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". file_path (str): Path to save the file. If None, a temporary file is created. """ if file_path is None: @@ -323,20 +308,7 @@ if __name__ == "__main__": print(api.list_speakers_as_tts_models()) ts = time.time() - wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name) + wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name) print(f" [i] XTTS took {time.time() - ts:.2f}s") - filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav") - - api = CS_API(model="XTTS-multilang") - print(api.speakers) - - ts = time.time() - wav, sr = api.tts( - "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en" - ) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en" - ) + filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav") diff --git a/docs/source/inference.md b/docs/source/inference.md index 4de9ecdd..b40445ae 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -198,19 +198,12 @@ from TTS.api import CS_API # Init 🐸 Coqui Studio API # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. -# XTTS - Best quality and life-like speech in EN +# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. api = CS_API(api_token=, model="XTTS") api.speakers # all the speakers are available with all the models. api.list_speakers() api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) - -# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon) -api = CS_API(api_token=, model="XTTS-multilingual") -api.speakers -api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) +wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5) # V1 - Fast and lightweight TTS in EN with emotion control. 
api = CS_API(api_token=, model="V1") From cc6e9fcaa72a6ba7255a1a39c77ffdb5b7bc7e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Nov 2023 11:13:58 +0100 Subject: [PATCH 4/9] Fix #3153 (#3169) --- TTS/vocoder/layers/losses.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index befc43cc..74cfc726 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): if isinstance(scores_fake, list): # multi-scale loss for score_fake, score_real in zip(scores_fake, scores_real): - total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) + total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real) loss += total_loss - real_loss += real_loss - fake_loss += fake_loss + real_loss += real_loss_ + fake_loss += fake_loss_ # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) From 99edd6daa38f929c703e29f6dfcdcdd8f5260b0d Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 8 Nov 2023 11:29:01 +0100 Subject: [PATCH 5/9] Fix ModelManager.list_models() (#3128) * fix(utils.manage): remove hard-coded model_type variable * refactor(utils.manage): address lint issues, fix typos Addressed the following: TTS/utils/manage.py:307:12: R1705: Unnecessary "else" after "return" (no-else-return) TTS/utils/manage.py:308:21: W1514: Using open without explicitly specifying an encoding (unspecified-encoding) TTS/utils/manage.py:299:4: R1710: Either all return statements in a function should return an expression, or none of them should. (inconsistent-return-statements) TTS/utils/manage.py:299:4: R0201: Method could be a function (no-self-use) TTS/utils/manage.py:314:4: R0201: Method could be a function (no-self-use) --- TTS/utils/manage.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index c732e1f5..1cd437e6 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -109,7 +109,6 @@ class ModelManager(object): def _list_for_model_type(self, model_type): models_name_list = [] model_count = 1 - model_type = "tts_models" models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list @@ -298,22 +297,22 @@ class ModelManager(object): model_item = self.set_model_url(model_item) return model_item, model_full_name, model, md5hash - def ask_tos(self, model_full_path): + @staticmethod + def ask_tos(model_full_path): """Ask the user to agree to the terms of service""" tos_path = os.path.join(model_full_path, "tos_agreed.txt") - if not os.path.exists(tos_path): - print(" > You must agree to the terms of service to use this model.") - print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") - print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]') - answer = input(" | | > ") - if answer.lower() == "y": - with open(tos_path, "w") as f: - f.write("I have read, understood ad agree the Terms and Conditions.") - return True - else: - return False + print(" > You must agree to the terms of service to use this model.") + print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") + print(' | > "I have read, understood and agreed to the Terms and Conditions." 
- [y/n]') + answer = input(" | | > ") + if answer.lower() == "y": + with open(tos_path, "w", encoding="utf-8") as f: + f.write("I have read, understood and agreed to the Terms and Conditions.") + return True + return False - def tos_agreed(self, model_item, model_full_path): + @staticmethod + def tos_agreed(model_item, model_full_path): """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") From 78a596618a4deb21ef1058e911c15334b81b0669 Mon Sep 17 00:00:00 2001 From: Gorkem Date: Wed, 8 Nov 2023 13:32:02 +0300 Subject: [PATCH 6/9] Fix for exception on streaming if last chunk empty (#3160) --- TTS/tts/models/xtts.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 4ab00270..a8a574c0 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -603,10 +603,21 @@ class Xtts(BaseTTS): if wav_gen_prev is not None: wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len] if wav_overlap is not None: - crossfade_wav = wav_chunk[:overlap_len] - crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device) - wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device) - wav_chunk[:overlap_len] += crossfade_wav + # cross fade the overlap section + if overlap_len > len(wav_chunk): + # wav_chunk is smaller than overlap_len, pass on last wav_gen + if wav_gen_prev is not None: + wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):] + else: + # not expecting will hit here as problem happens on last chunk + wav_chunk = wav_gen[-overlap_len:] + return wav_chunk, wav_gen, None + else: + crossfade_wav = wav_chunk[:overlap_len] + crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device) + wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device) + wav_chunk[:overlap_len] += crossfade_wav + wav_overlap = wav_gen[-overlap_len:] wav_gen_prev = wav_gen return wav_chunk, wav_gen_prev, wav_overlap From 03ad90135bb70d1ca6b46b3b7f3e89563aa65af6 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Wed, 8 Nov 2023 13:47:33 +0100 Subject: [PATCH 7/9] Add lang code in XTTS doc (#3158) * Add lang code in XTTS doc * Remove ununsed config and args * update docs * woops --- TTS/tts/configs/xtts_config.py | 22 -------------------- TTS/tts/models/xtts.py | 37 ---------------------------------- docs/source/models/xtts.md | 11 ++++------ 3 files changed, 4 insertions(+), 66 deletions(-) diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index ea95faf5..2d3edaf4 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to `0.8`. - cond_free_k (float): - Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. - As cond_free_k increases, the output becomes dominated by the conditioning-free signal. - Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`. - - diffusion_temperature (float): - Controls the variance of the noise fed into the diffusion model. [0,1]. 
Values at 0 - are the "mean" prediction of the diffusion network and will sound bland and smeared. - Defaults to `1.0`. - num_gpt_outputs (int): Number of samples taken from the autoregressive model, all of which are filtered using CLVP. As XTTS is a probabilistic model, more samples means a higher probability of creating something "great". Defaults to `16`. - decoder_iterations (int): - Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine - the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, - however. Defaults to `30`. - - decoder_sampler (str): - Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`. - gpt_cond_len (int): Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`. @@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig): repetition_penalty: float = 2.0 top_k: int = 50 top_p: float = 0.85 - cond_free_k: float = 2.0 - diffusion_temperature: float = 1.0 num_gpt_outputs: int = 1 - decoder_iterations: int = 30 - decoder_sampler: str = "ddim" # cloning gpt_cond_len: int = 3 diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index a8a574c0..7cc9836a 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -152,19 +152,6 @@ class XttsArgs(Coqpit): gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024. gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True. gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False. - - For DiffTTS model: - diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024. - diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10. - diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100. - diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200. - diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024. - diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193. - diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0. - diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False. - diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16. - diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0. - diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0. 
""" gpt_batch_size: int = 1 @@ -193,19 +180,6 @@ class XttsArgs(Coqpit): gpt_use_masking_gt_prompt_approach: bool = True gpt_use_perceiver_resampler: bool = False - # Diffusion Decoder params - diff_model_channels: int = 1024 - diff_num_layers: int = 10 - diff_in_channels: int = 100 - diff_out_channels: int = 200 - diff_in_latent_channels: int = 1024 - diff_in_tokens: int = 8193 - diff_dropout: int = 0 - diff_use_fp16: bool = False - diff_num_heads: int = 16 - diff_layer_drop: int = 0 - diff_unconditioned_percentage: int = 0 - # HifiGAN Decoder params input_sample_rate: int = 22050 output_sample_rate: int = 24000 @@ -426,10 +400,6 @@ class Xtts(BaseTTS): "repetition_penalty": config.repetition_penalty, "top_k": config.top_k, "top_p": config.top_p, - "cond_free_k": config.cond_free_k, - "diffusion_temperature": config.diffusion_temperature, - "decoder_iterations": config.decoder_iterations, - "decoder_sampler": config.decoder_sampler, "gpt_cond_len": config.gpt_cond_len, "max_ref_len": config.max_ref_len, "sound_norm_refs": config.sound_norm_refs, @@ -454,13 +424,6 @@ class Xtts(BaseTTS): gpt_cond_len=6, max_ref_len=10, sound_norm_refs=False, - # Decoder inference - decoder_iterations=100, - cond_free=True, - cond_free_k=2, - diffusion_temperature=1.0, - decoder_sampler="ddim", - decoder="hifigan", **hf_generate_kwargs, ): """ diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 8167a1d1..03e44af1 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference. Current implementation only supports inference. ### Languages -As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese, -Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean +As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out. 
@@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru model.cuda() print("Computing speaker latents...") -gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) print("Inference...") out = model.inference( @@ -124,7 +123,6 @@ out = model.inference( "en", gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=0.7, # Add custom parameters here ) torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) @@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru model.cuda() print("Computing speaker latents...") -gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) print("Inference...") t0 = time.time() @@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI model.cuda() print("Computing speaker latents...") -gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) print("Inference...") out = model.inference( @@ -218,7 +216,6 @@ out = model.inference( "en", gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=0.7, # Add custom parameters here ) torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) From 58cb0d8dd0a67e0d599d264987e518d823a78f46 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Wed, 8 Nov 2023 14:51:42 +0100 Subject: [PATCH 8/9] Remove v1 doc and tests (#3172) * remove v1 in inference.md * remove v1 in README.md * Update test_models.py --- README.md | 2 +- docs/source/inference.md | 4 ++-- tests/zoo_tests/test_models.py | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 353db7cf..935627e5 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language diff --git a/docs/source/inference.md b/docs/source/inference.md index b40445ae..611a2445 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language @@ -231,4 +231,4 @@ api.tts_with_vc_to_file( speaker_wav="target/speaker.wav", file_path="ouptut.wav" ) -``` \ No newline at end of file +``` diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 79aef5cb..d1c6b67c 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager MODELS_WITH_SEP_TESTS = [ 
"tts_models/multilingual/multi-dataset/bark", "tts_models/en/multi-dataset/tortoise-v2", - "tts_models/multilingual/multi-dataset/xtts_v1", "tts_models/multilingual/multi-dataset/xtts_v1.1", "tts_models/multilingual/multi-dataset/xtts_v2", ] @@ -83,14 +82,14 @@ def test_xtts(): if use_gpu: run_cli( "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 " + f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) else: run_cli( "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 " + f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " f'--text "This is an example." --out_path "{output_path}" --progress_bar False ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) @@ -104,7 +103,7 @@ def test_xtts_streaming(): speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1") + model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") config = XttsConfig() config.load_json(os.path.join(model_path, "config.json")) model = Xtts.init_from_config(config) From 46d9c27212939aa54b22f9df842c753de67b1f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Nov 2023 16:07:56 +0100 Subject: [PATCH 9/9] Update to v0.20.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 847e9aef..727d97b9 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.20.1 +0.20.2