diff --git a/TTS/.models.json b/TTS/.models.json index 52cdf795..0b502073 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 3469f701..4e574c5a 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: str = None + d_vector_file: List[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 518809b3..1b367cd7 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -477,8 +477,8 @@ class VitsArgs(Coqpit): use_d_vector_file (bool): Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False. - d_vector_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. + d_vector_file (List[str]): + List of paths to the files including pre-computed speaker embeddings. Defaults to None. d_vector_dim (int): Number of d-vector channels. Defaults to 0. 
@@ -573,7 +573,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: str = None + d_vector_file: List[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 46d999a2..0159a9d2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager): self.embeddings_by_names.update(embeddings_by_names) self.embeddings.update(embeddings) + # reset name_to_id to get the right speaker ids + self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} + def get_embedding_by_clip(self, clip_idx: str) -> List: """Get embedding by clip ID. diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 21fefa0b..e4969526 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager): if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False): speaker_manager = SpeakerManager() - if get_from_config_or_model_args_with_default(config, "speakers_file", None): - speaker_manager = SpeakerManager( - d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) - ) if get_from_config_or_model_args_with_default(config, "d_vector_file", None): speaker_manager = SpeakerManager( d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 44348b29..ef4c11f5 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -339,10 +339,18 @@ class ModelManager(object): sub_conf = sub_conf[fd] else: return - sub_conf[field_names[-1]] = new_path + if isinstance(sub_conf[field_names[-1]], list): + sub_conf[field_names[-1]] = [new_path] + else: + sub_conf[field_names[-1]] = new_path else: # field name points to a 
top-level field - config[field_name] = new_path + if not field_name in config: + return + if isinstance(config[field_name], list): + config[field_name] = [new_path] + else: + config[field_name] = new_path config.save_json(config_path) @staticmethod diff --git a/recipes/vctk/yourtts/train_yourtts.py b/recipes/vctk/yourtts/train_yourtts.py index aa584396..b783c5d6 100644 --- a/recipes/vctk/yourtts/train_yourtts.py +++ b/recipes/vctk/yourtts/train_yourtts.py @@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH): # init configs vctk_config = BaseDatasetConfig( - formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en" + formatter="vctk", + dataset_name="vctk", + meta_file_train="", + meta_file_val="", + path=VCTK_DOWNLOAD_PATH, + language="en", + ignored_speakers=[ + "p261", + "p225", + "p294", + "p347", + "p238", + "p234", + "p248", + "p335", + "p245", + "p326", + "p302", + ], # Ignore the test speakers to fully replicate the paper experiment ) # Add here all datasets configs, in our case we just want to train with the VCTK dataset then we need to add just VCTK. 
Note: If you want to added new datasets just added they here and it will automatically compute the speaker embeddings (d-vectors) for this new dataset :) @@ -111,11 +129,11 @@ model_args = VitsArgs( use_d_vector_file=True, d_vector_dim=512, num_layers_text_encoder=10, + speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH, + speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH, resblock_type_decoder="2", # On the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model # Usefull parameters to enable the Speaker Consistency Loss (SCL) discribed in the paper # use_speaker_encoder_as_loss=True, - # speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH, - # speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH, # Usefull parameters to the enable multilingual training # use_language_embedding=True, # embedded_language_dim=4, @@ -207,6 +225,7 @@ config = VitsConfig( use_weighted_sampler=True, # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has weighted_sampler_attrs={"speaker_name": 1.0}, + weighted_sampler_multipliers={}, # It defines the Speaker Consistency Loss (SCL) α to 9 like the paper speaker_encoder_loss_alpha=9.0, ) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index ccc3be1c..8e408519 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -210,7 +210,7 @@ class TestVits(unittest.TestCase): num_chars=32, use_d_vector_file=True, d_vector_dim=256, - d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) config = VitsConfig(model_args=args) model = Vits.init_from_config(config, verbose=False).to(device) @@ -355,7 +355,7 @@ class TestVits(unittest.TestCase): num_chars=32, use_d_vector_file=True, d_vector_dim=256, - 
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) config = VitsConfig(model_args=args) model = Vits.init_from_config(config, verbose=False).to(device) @@ -587,7 +587,7 @@ class TestVits(unittest.TestCase): num_chars=32, use_d_vector_file=True, d_vector_dim=256, - d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) ) model = Vits.init_from_config(config, verbose=False).to(device) diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 29c5b438..741bda91 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -33,7 +33,7 @@ config.audio.trim_db = 60 # active multispeaker d-vec mode config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] config.model_args.d_vector_dim = 256 diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index db66802b..fd58db53 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -63,8 +63,8 @@ config.use_speaker_embedding = False # active multispeaker d-vec mode config.model_args.use_d_vector_file = True config.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] +config.d_vector_file = ["tests/data/ljspeech/speakers.json"] config.model_args.d_vector_dim = 256 config.d_vector_dim = 256