def __post_init__(self):
    """Mirror the multi-speaker fields of this config onto ``self.model_args``.

    The original note on this method says ``model.init_multispeaker()`` looks
    for these settings on the model args, so every value that is actually set
    on the config is copied across. Falsy / unset values (``0``, ``None``,
    ``False``, empty string) are skipped, leaving whatever ``model_args``
    already holds untouched.
    """
    # NOTE(review): the class-level default for `model_args` is a single
    # `ForwardTTSArgs()` instance. Unless the base class copies defaults per
    # instance, the in-place writes below are shared by every config that
    # relies on that default -- confirm.
    model_args = self.model_args

    # Guard against `None` the same way `d_vector_dim` is guarded below;
    # a bare `None > 0` comparison would raise `TypeError`.
    if self.num_speakers is not None and self.num_speakers > 0:
        model_args.num_speakers = self.num_speakers

    # speaker embedding settings
    if self.use_speaker_embedding:
        model_args.use_speaker_embedding = True
    if self.speakers_file:
        model_args.speakers_file = self.speakers_file

    # d-vector settings
    if self.use_d_vector_file:
        model_args.use_d_vector_file = True
    if self.d_vector_dim is not None and self.d_vector_dim > 0:
        model_args.d_vector_dim = self.d_vector_dim
    if self.d_vector_file:
        model_args.d_vector_file = self.d_vector_file
def __post_init__(self):
    """Copy the multi-speaker settings of this config onto ``self.model_args``.

    The original note on this method says ``model.init_multispeaker()`` looks
    for these settings on the model args. Only values that are actually set
    are forwarded; falsy / unset values (``0``, ``None``, ``False``, empty
    string) leave the corresponding ``model_args`` attribute untouched.
    """
    # NOTE(review): the class-level default for `model_args` is a single
    # `ForwardTTSArgs(use_pitch=False)` instance. Unless the base class copies
    # defaults per instance, the in-place writes below are shared by every
    # config that relies on that default -- confirm.
    model_args = self.model_args

    # NOTE(review): the visible multi-speaker field list of this config
    # declares `speakers_file` but no `num_speakers`. Read it defensively so a
    # missing or `None` value is treated as "not set" instead of raising;
    # declaring `num_speakers: int = 0` on the class is the cleaner long-term
    # fix -- confirm whether a base class already provides it.
    num_speakers = getattr(self, "num_speakers", None)
    if num_speakers is not None and num_speakers > 0:
        model_args.num_speakers = num_speakers

    # speaker embedding settings
    if self.use_speaker_embedding:
        model_args.use_speaker_embedding = True
    if self.speakers_file:
        model_args.speakers_file = self.speakers_file

    # d-vector settings
    if self.use_d_vector_file:
        model_args.use_d_vector_file = True
    if self.d_vector_dim is not None and self.d_vector_dim > 0:
        model_args.d_vector_dim = self.d_vector_dim
    if self.d_vector_file:
        model_args.d_vector_file = self.d_vector_file
def __post_init__(self):
    """Forward the multi-speaker settings of this config to ``self.model_args``.

    The original note on this method says ``model.init_multispeaker()`` looks
    for these settings on the model args. Each field is copied only when it
    carries a meaningful value; falsy / unset values (``0``, ``None``,
    ``False``, empty string) leave the matching ``model_args`` attribute as
    it was.
    """
    model_args = self.model_args

    # `num_speakers` gets the same `None` guard that `d_vector_dim` has
    # below; comparing `None > 0` would raise `TypeError`.
    if self.num_speakers is not None and self.num_speakers > 0:
        model_args.num_speakers = self.num_speakers

    # speaker embedding settings
    if self.use_speaker_embedding:
        model_args.use_speaker_embedding = True
    if self.speakers_file:
        model_args.speakers_file = self.speakers_file

    # d-vector settings
    if self.use_d_vector_file:
        model_args.use_d_vector_file = True
    if self.d_vector_dim is not None and self.d_vector_dim > 0:
        model_args.d_vector_dim = self.d_vector_dim
    if self.d_vector_file:
        model_args.d_vector_file = self.d_vector_file