Edit model configs for multi-speaker

2021-10-21 13:51:37 +00:00 · 2021-10-21 13:51:37 +00:00 · 3ab009ca8d
parent cea8e1739b
commit 3ab009ca8d
3 changed files with 77 additions and 3 deletions
--- a/TTS/tts/configs/fast_pitch_config.py
+++ b/TTS/tts/configs/fast_pitch_config.py
@ -11,7 +11,7 @@ class FastPitchConfig(BaseTTSConfig):

    Example:

-        >>> from TTS.tts.configs import FastPitchConfig
+        >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
        >>> config = FastPitchConfig()

    Args:
@ -30,6 +30,10 @@ class FastPitchConfig(BaseTTSConfig):
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
            for the rest. Defaults to 10.

+        speakers_file (str):
+            Path to the file containing the list of speakers. Needed at inference to match speaker ids to
+            speaker names. Defaults to `None`.
+
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
@ -105,6 +109,8 @@ class FastPitchConfig(BaseTTSConfig):
    model_args: ForwardTTSArgs = ForwardTTSArgs()

    # multi-speaker settings
+    num_speakers: int = 0
+    speakers_file: str = None
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_file: str = False
@ -149,3 +155,22 @@ class FastPitchConfig(BaseTTSConfig):
            "Prior to November 22, 1963.",
        ]
    )
+
+    def __post_init__(self):
+        """Forward the config-level multi-speaker settings to ``self.model_args``.
+
+        A setting is forwarded only when it is set (positive for the numeric fields, truthy for the flags
+        and file paths); otherwise the value already stored on ``self.model_args`` is left as it is.
+        """
+        # NOTE(review): no `super().__post_init__()` call is made here - confirm no base-class hook is skipped.
+        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+        if self.num_speakers > 0:
+            self.model_args.num_speakers = self.num_speakers
+
+        # speaker embedding settings
+        # The flag is only ever switched on: a False config value does not reset `model_args`.
+        if self.use_speaker_embedding:
+            self.model_args.use_speaker_embedding = True
+        if self.speakers_file:
+            self.model_args.speakers_file = self.speakers_file
+
+        # d-vector settings
+        if self.use_d_vector_file:
+            self.model_args.use_d_vector_file = True
+        # NOTE(review): `d_vector_dim` is not declared in the hunks shown for this class - confirm it exists.
+        if self.d_vector_dim is not None and self.d_vector_dim > 0:
+            self.model_args.d_vector_dim = self.d_vector_dim
+        if self.d_vector_file:
+            self.model_args.d_vector_file = self.d_vector_file
--- a/TTS/tts/configs/fast_speech_config.py
+++ b/TTS/tts/configs/fast_speech_config.py
@ -30,6 +30,11 @@ class FastSpeechConfig(BaseTTSConfig):
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
            for the rest. Defaults to 10.

+        speakers_file (str):
+            Path to the file containing the list of speakers. Needed at inference to match speaker ids to
+            speaker names. Defaults to `None`.
+
+
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
@ -105,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig):
    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)

    # multi-speaker settings
+    speakers_file: str = None
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_file: str = False
@ -149,3 +155,22 @@ class FastSpeechConfig(BaseTTSConfig):
            "Prior to November 22, 1963.",
        ]
    )
+
+    def __post_init__(self):
+        """Forward the config-level multi-speaker settings to ``self.model_args``.
+
+        A setting is forwarded only when it is set (positive for the numeric fields, truthy for the flags
+        and file paths); otherwise the value already stored on ``self.model_args`` is left as it is.
+        """
+        # NOTE(review): no `super().__post_init__()` call is made here - confirm no base-class hook is skipped.
+        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+        # NOTE(review): the hunks shown for this class add `speakers_file` but no `num_speakers` field -
+        # confirm `num_speakers` is declared elsewhere, otherwise this attribute access fails.
+        if self.num_speakers > 0:
+            self.model_args.num_speakers = self.num_speakers
+
+        # speaker embedding settings
+        # The flag is only ever switched on: a False config value does not reset `model_args`.
+        if self.use_speaker_embedding:
+            self.model_args.use_speaker_embedding = True
+        if self.speakers_file:
+            self.model_args.speakers_file = self.speakers_file
+
+        # d-vector settings
+        if self.use_d_vector_file:
+            self.model_args.use_d_vector_file = True
+        # NOTE(review): `d_vector_dim` is not declared in the hunks shown for this class - confirm it exists.
+        if self.d_vector_dim is not None and self.d_vector_dim > 0:
+            self.model_args.d_vector_dim = self.d_vector_dim
+        if self.d_vector_file:
+            self.model_args.d_vector_file = self.d_vector_file
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@ -1,8 +1,8 @@
 from dataclasses import dataclass, field
 from typing import List

-from TTS.tts.configs.shared_configs import BaseTTSConfig
 from TTS.tts.models.forward_tts import ForwardTTSArgs
+from TTS.tts.configs.shared_configs import BaseTTSConfig


@dataclass
@ -30,6 +30,10 @@ class SpeedySpeechConfig(BaseTTSConfig):
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
            for the rest. Defaults to 10.

+        speakers_file (str):
+            Path to the file containing the list of speakers. Needed at inference to match speaker ids to
+            speaker names. Defaults to `None`.
+
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
@ -117,12 +121,13 @@ class SpeedySpeechConfig(BaseTTSConfig):
        },
        out_channels=80,
        hidden_channels=128,
-        num_speakers=0,
        positional_encoding=True,
        detach_duration_predictor=True,
    )

    # multi-speaker settings
+    num_speakers: int = 0
+    speakers_file: str = None
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_file: str = False
@ -166,3 +171,22 @@ class SpeedySpeechConfig(BaseTTSConfig):
            "Prior to November 22, 1963.",
        ]
    )
+
+    def __post_init__(self):
+        """Forward the config-level multi-speaker settings to ``self.model_args``.
+
+        A setting is forwarded only when it is set (positive for the numeric fields, truthy for the flags
+        and file paths); otherwise the value already stored on ``self.model_args`` is left as it is.
+        """
+        # NOTE(review): no `super().__post_init__()` call is made here - confirm no base-class hook is skipped.
+        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+        if self.num_speakers > 0:
+            self.model_args.num_speakers = self.num_speakers
+
+        # speaker embedding settings
+        # The flag is only ever switched on: a False config value does not reset `model_args`.
+        if self.use_speaker_embedding:
+            self.model_args.use_speaker_embedding = True
+        if self.speakers_file:
+            self.model_args.speakers_file = self.speakers_file
+
+        # d-vector settings
+        if self.use_d_vector_file:
+            self.model_args.use_d_vector_file = True
+        # NOTE(review): `d_vector_dim` is not declared in the hunks shown for this class - confirm it exists.
+        if self.d_vector_dim is not None and self.d_vector_dim > 0:
+            self.model_args.d_vector_dim = self.d_vector_dim
+        if self.d_vector_file:
+            self.model_args.d_vector_file = self.d_vector_file