mirror of https://github.com/coqui-ai/TTS.git
Fixed bug related to yourtts speaker embeddings issue (#2234)
* Fixed bug related to yourtts speaker embeddings issue
* Reverted code for base_tts
* Bug fix on VITS d_vector_file type
* Ignore the test speakers on YourTTS recipe
* Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference
* Update YourTTS config file
* Update ModelManager._update_path to deal with list attributes
* Fix lint checks
* Remove unused code
* Fix unit tests
* Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files
* Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
This commit is contained in:
parent: da93d768b8
commit: 42afad5e79
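The headline change ships a speaker encoder alongside the YourTTS release (note the updated v0.10.1 model URL in the first hunk below), so zero-shot voice cloning works out of the box. A minimal sketch of that use case, assuming the Python API available in recent coqui-ai/TTS releases; the file paths are placeholders:

from TTS.api import TTS

# Load the released YourTTS model and clone a voice from a short reference clip.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
tts.tts_to_file(
    text="Zero-shot synthesis in the reference speaker's voice.",
    speaker_wav="reference.wav",  # placeholder: a short clip of the target speaker
    language="en",
    file_path="output.wav",
)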
@@ -4,7 +4,7 @@
         "multi-dataset":{
             "your_tts":{
                 "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
                 "default_vocoder": null,
                 "commit": "e9a1953e",
                 "license": "CC BY-NC-ND 4.0",
@@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
 
     # use d-vectors
     use_d_vector_file: bool = False
-    d_vector_file: str = None
+    d_vector_file: List[str] = None
    d_vector_dim: int = None
 
     def __post_init__(self):
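With this change d_vector_file is list-typed, so configs pass one or more embedding-file paths; a single file becomes a one-element list. A minimal sketch of the new usage (the embedding file names are placeholders):

from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs

# Several pre-computed d-vector files can now back one model; a single
# file is simply wrapped in a one-element list.
model_args = VitsArgs(
    use_d_vector_file=True,
    d_vector_dim=512,
    d_vector_file=["embeddings_vctk.json", "embeddings_libritts.json"],
)
config = VitsConfig(model_args=model_args)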
@@ -477,8 +477,8 @@ class VitsArgs(Coqpit):
         use_d_vector_file (bool):
             Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
 
-        d_vector_file (str):
-            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        d_vector_file (List[str]):
+            List of paths to the files including pre-computed speaker embeddings. Defaults to None.
 
         d_vector_dim (int):
             Number of d-vector channels. Defaults to 0.
@@ -573,7 +573,7 @@ class VitsArgs(Coqpit):
     use_speaker_embedding: bool = False
     num_speakers: int = 0
    speakers_file: str = None
-    d_vector_file: str = None
+    d_vector_file: List[str] = None
     speaker_embedding_channels: int = 256
     use_d_vector_file: bool = False
     d_vector_dim: int = 0
@@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager):
         self.embeddings_by_names.update(embeddings_by_names)
         self.embeddings.update(embeddings)
 
+        # reset name_to_id to get the right speaker ids
+        self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
+
     def get_embedding_by_clip(self, clip_idx: str) -> List:
         """Get embedding by clip ID.
 
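This reset is what makes multi-file loading safe: each embedding file carries its own speaker IDs, and merging several files can leave duplicate or gapped values in name_to_id. Re-enumerating the merged keys assigns fresh, contiguous, collision-free IDs. A stand-alone illustration of the same comprehension:

# After merging two files, IDs collide (both files started counting at 0).
merged = {"p225": 0, "p226": 1, "spk_a": 0, "spk_b": 1}

# Re-enumerate the merged keys: every speaker gets a unique, contiguous ID.
name_to_id = {name: i for i, name in enumerate(merged)}
assert name_to_id == {"p225": 0, "p226": 1, "spk_a": 2, "spk_b": 3}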
@@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager):
 
         if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
             speaker_manager = SpeakerManager()
-            if get_from_config_or_model_args_with_default(config, "speakers_file", None):
-                speaker_manager = SpeakerManager(
-                    d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
-                )
             if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
                 speaker_manager = SpeakerManager(
                     d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)
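The deleted branch re-created the manager from speakers_file and read a "speaker_file" key that looks like a typo; with d-vectors enabled, only d_vector_file is consulted now. A sketch of the resulting construction, assuming d_vectors_file_path accepts the new list type as the code above suggests (file names are placeholders):

from TTS.tts.utils.speakers import SpeakerManager

# Build the manager straight from pre-computed d-vector files.
speaker_manager = SpeakerManager(
    d_vectors_file_path=["embeddings_vctk.json", "embeddings_libritts.json"]
)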
@@ -339,10 +339,18 @@ class ModelManager(object):
                     sub_conf = sub_conf[fd]
                 else:
                     return
-            sub_conf[field_names[-1]] = new_path
+            if isinstance(sub_conf[field_names[-1]], list):
+                sub_conf[field_names[-1]] = [new_path]
+            else:
+                sub_conf[field_names[-1]] = new_path
         else:
             # field name points to a top-level field
-            config[field_name] = new_path
+            if not field_name in config:
+                return
+            if isinstance(config[field_name], list):
+                config[field_name] = [new_path]
+            else:
+                config[field_name] = new_path
         config.save_json(config_path)
 
     @staticmethod
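Because downloaded-model configs may now store d_vector_file as a list, the path rewrite has to preserve the attribute's type instead of overwriting it with a bare string. An illustrative stand-alone version of the new branch logic (not the method itself; the demo config keys are placeholders):

def update_path(config: dict, field_name: str, new_path: str) -> None:
    # Skip fields the config does not define.
    if field_name not in config:
        return
    # Keep list-typed attributes (e.g. d_vector_file) as lists.
    if isinstance(config[field_name], list):
        config[field_name] = [new_path]
    else:
        config[field_name] = new_path

cfg = {"d_vector_file": ["old/speakers.json"], "model_path": "old/model.pth"}
update_path(cfg, "d_vector_file", "new/speakers.json")
assert cfg["d_vector_file"] == ["new/speakers.json"]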
@@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
 
 # init configs
 vctk_config = BaseDatasetConfig(
-    formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
+    formatter="vctk",
+    dataset_name="vctk",
+    meta_file_train="",
+    meta_file_val="",
+    path=VCTK_DOWNLOAD_PATH,
+    language="en",
+    ignored_speakers=[
+        "p261",
+        "p225",
+        "p294",
+        "p347",
+        "p238",
+        "p234",
+        "p248",
+        "p335",
+        "p245",
+        "p326",
+        "p302",
+    ], # Ignore the test speakers to full replicate the paper experiment
 )
 
 # Add here all datasets configs, in our case we just want to train with the VCTK dataset then we need to add just VCTK. Note: If you want to added new datasets just added they here and it will automatically compute the speaker embeddings (d-vectors) for this new dataset :)
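ignored_speakers keeps the paper's eleven test speakers out of training so they remain unseen for zero-shot evaluation. Conceptually (this is an illustration, not the formatter's actual code), the filtering amounts to:

# Illustrative only: the vctk formatter skips samples from ignored speakers.
ignored = {"p261", "p225", "p294", "p347", "p238", "p234",
           "p248", "p335", "p245", "p326", "p302"}
train_samples = [s for s in all_samples if s["speaker_name"] not in ignored]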
@@ -111,11 +129,11 @@ model_args = VitsArgs(
     use_d_vector_file=True,
     d_vector_dim=512,
     num_layers_text_encoder=10,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
     resblock_type_decoder="2", # On the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
     # Usefull parameters to enable the Speaker Consistency Loss (SCL) discribed in the paper
     # use_speaker_encoder_as_loss=True,
-    # speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
-    # speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
     # Usefull parameters to the enable multilingual training
     # use_language_embedding=True,
     # embedded_language_dim=4,
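With the encoder checkpoint and config now set unconditionally, enabling the Speaker Consistency Loss (SCL) no longer requires re-adding those paths; one flag suffices. A sketch under that assumption, with placeholder values standing in for the recipe's constants:

from TTS.tts.models.vits import VitsArgs

SPEAKER_ENCODER_CHECKPOINT_PATH = "checkpoints/model_se.pth.tar"  # placeholder
SPEAKER_ENCODER_CONFIG_PATH = "checkpoints/config_se.json"        # placeholder

# The speaker encoder assets are already wired in, so SCL only needs the flag.
model_args = VitsArgs(
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    use_speaker_encoder_as_loss=True,  # enables SCL; alpha is set in VitsConfig
)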
@@ -207,6 +225,7 @@ config = VitsConfig(
     use_weighted_sampler=True,
     # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
     weighted_sampler_attrs={"speaker_name": 1.0},
+    weighted_sampler_multipliers={},
     # It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
     speaker_encoder_loss_alpha=9.0,
 )
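Defaulting weighted_sampler_multipliers to an empty dict avoids a common misconfiguration: with only weighted_sampler_attrs={"speaker_name": 1.0} active, every speaker is drawn with equal probability and no speaker gets an extra boost. An illustrative (not the trainer's actual) weight computation:

from collections import Counter

# Toy sample list: p225 is over-represented 3-to-1.
samples = [{"speaker_name": "p225"}] * 3 + [{"speaker_name": "p226"}]
counts = Counter(s["speaker_name"] for s in samples)

# Inverse-frequency weights balance speakers; an empty multipliers dict
# means no per-speaker boost is applied on top of this.
weights = [1.0 / counts[s["speaker_name"]] for s in samples]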
@@ -210,7 +210,7 @@ class TestVits(unittest.TestCase):
             num_chars=32,
             use_d_vector_file=True,
             d_vector_dim=256,
-            d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+            d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
         )
         config = VitsConfig(model_args=args)
         model = Vits.init_from_config(config, verbose=False).to(device)
@@ -355,7 +355,7 @@ class TestVits(unittest.TestCase):
             num_chars=32,
             use_d_vector_file=True,
             d_vector_dim=256,
-            d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+            d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
         )
         config = VitsConfig(model_args=args)
         model = Vits.init_from_config(config, verbose=False).to(device)
@@ -587,7 +587,7 @@ class TestVits(unittest.TestCase):
                 num_chars=32,
                 use_d_vector_file=True,
                 d_vector_dim=256,
-                d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+                d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
             )
         )
         model = Vits.init_from_config(config, verbose=False).to(device)
@@ -33,7 +33,7 @@ config.audio.trim_db = 60
 
 # active multispeaker d-vec mode
 config.model_args.use_d_vector_file = True
-config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
+config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
 config.model_args.d_vector_dim = 256
 
 
@@ -63,8 +63,8 @@ config.use_speaker_embedding = False
 # active multispeaker d-vec mode
 config.model_args.use_d_vector_file = True
 config.use_d_vector_file = True
-config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
-config.d_vector_file = "tests/data/ljspeech/speakers.json"
+config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
+config.d_vector_file = ["tests/data/ljspeech/speakers.json"]
 config.model_args.d_vector_dim = 256
 config.d_vector_dim = 256
 