mirror of https://github.com/coqui-ai/TTS.git
enable multi-speaker CoquiTTS models for synthesize.py
This commit is contained in:
parent b531fa699c
commit 2f0716073e
@@ -29,27 +29,65 @@ def main():
         """You can either use your trained model or choose a model from the provided list.\n\n"""
+        """If you don't specify any models, then it uses LJSpeech based English models\n\n"""
         """
-    Example runs:
+    # Example Runs:
 
-    # list provided models
-    ./TTS/bin/synthesize.py --list_models
+    ## Single Speaker Models
 
-    # run tts with default models.
-    ./TTS/bin/synthesize.py --text "Text for TTS"
+    - list provided models
 
-    # run a tts model with its default vocoder model.
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+    $ ./TTS/bin/synthesize.py --list_models
+    ```
 
-    # run with specific tts and vocoder models from the list
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
+    - run tts with default models.
 
-    # run your own TTS model (Using Griffin-Lim Vocoder)
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS"
+    ```
+
+    - run a tts model with its default vocoder model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+
+    - run with specific tts and vocoder models from the list
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
+    ```
+
+    - run your own TTS model (Using Griffin-Lim Vocoder)
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
 
-    # run your own TTS and Vocoder models
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
-        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    - run your own TTS and Vocoder models
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    ```
+
+    ## MULTI-SPEAKER MODELS
+
+    - list the available speakers and choose a <speaker_id> among them.
+
+    ```
+    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+    ```
+
+    - run the multi-speaker TTS model with the target speaker ID.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+    ```
+
+    - run your own multi-speaker TTS model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+    ```
     """,
         formatter_class=RawTextHelpFormatter,
     )
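The same flow is available from Python through the `Synthesizer` class this commit extends. A minimal sketch, assuming the constructor keyword names (`tts_checkpoint`, `tts_config`, `use_cuda`) and the `save_wav` helper match this revision of the synthesizer module; they are read off this code, not a documented stable API:

```
# Sketch only: constructor keywords and save_wav() are assumptions
# based on this revision of the repo.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth.tar",  # mirrors --model_path
    tts_config="path/to/config.json",        # mirrors --config_path
    use_cuda=False,
)

# Single-speaker model: only the text is required.
wav = synthesizer.tts("Text for TTS")

# Multi-speaker model: pass the target speaker ID, mirroring --speaker_idx.
# wav = synthesizer.tts("Text for TTS", speaker_idx="<speaker_id>")

synthesizer.save_wav(wav, "output/path/speech.wav")
```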
@@ -113,21 +151,23 @@ def main():
     parser.add_argument(
         "--speaker_idx",
         type=str,
-        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
+        help="Target speaker ID for a multi-speaker TTS model.",
         default=None,
     )
     parser.add_argument(
         "--speaker_wav",
         nargs="+",
-        help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vector is computed as their average.",
+        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vector is computed as their average.",
         default=None,
     )
     parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
+        type=str2bool,
+        nargs="?",
+        const=True,
         default=False,
     )
     # aux args
     parser.add_argument(
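The `--list_speaker_idxs` definition combines `type=str2bool`, `nargs="?"`, and `const=True` so the option works as a bare flag yet still accepts an explicit true/false value. A self-contained sketch of the pattern, with a stand-in `str2bool` since the repo's helper is defined elsewhere:

```
import argparse

def str2bool(v):
    # Stand-in for the repo's helper: map common strings to booleans.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")

parser = argparse.ArgumentParser()
parser.add_argument("--list_speaker_idxs", type=str2bool, nargs="?", const=True, default=False)

print(parser.parse_args([]).list_speaker_idxs)                               # False (default)
print(parser.parse_args(["--list_speaker_idxs"]).list_speaker_idxs)          # True (bare flag -> const)
print(parser.parse_args(["--list_speaker_idxs", "false"]).list_speaker_idxs) # False (explicit value)
```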
@@ -202,6 +242,14 @@ def main():
         print(synthesizer.speaker_manager.speaker_ids)
         return
 
+    # check the arguments against a multi-speaker model.
+    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+        print(
+            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+            "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
+        )
+        return
+
     # RUN THE SYNTHESIS
     print(" > Text: {}".format(args.text))
@@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             print(
                 "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
             )
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
             if not speaker_mapping:
                 raise RuntimeError(
                     "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                 )
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
             not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(
-                speaker in speaker_mapping
-                for speaker in speakers), ("As of now, you cannot "
-                                           "introduce new speakers to "
-                                           "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            assert all(speaker in speaker_mapping for speaker in speakers), (
+                "As of now, you cannot introduce new speakers to a previously trained model."
+            )
+        elif (
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
-        elif (c.use_external_speaker_embedding_file
-              and not c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file and don't pass external embedding file
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file and don't pass external embedding file
             raise RuntimeError(
                 "use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder"
             )
         else:  # if start new train and don't use External Embedding file
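All branches above derive `speaker_embedding_dim` from the first entry of the loaded mapping. A sketch of the speakers.json shape this implies; the keys and the "name" field are illustrative, only "embedding" is read off the code above:

```
import json

# Assumed layout of an external speaker embedding file (speakers.json),
# inferred from speaker_mapping[...]["embedding"] in parse_speakers().
speakers_json = """
{
    "ljspeech-1": {"name": "ljspeech", "embedding": [0.01, -0.23, 0.11, 0.98]},
    "ljspeech-2": {"name": "ljspeech", "embedding": [0.07, -0.12, 0.30, 0.74]}
}
"""

speaker_mapping = json.loads(speakers_json)
# Same expression as in parse_speakers(): dimension of the first embedding.
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
print(speaker_embedding_dim)  # 4 in this toy example
```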
@@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -133,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
+        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
     """

     def __init__(
         self,
         x_vectors_file_path: str = "",
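The `--speaker_wav` help earlier says multiple reference clips are averaged into a single x-vector. A minimal sketch of that averaging step, with the speaker encoder replaced by a stand-in function (the real call would be something like SpeakerManager's per-clip embedding method, an assumption here):

```
import numpy as np

def average_x_vector(wav_paths, compute_x_vector):
    # compute_x_vector is a stand-in for the speaker encoder's
    # per-clip embedding function.
    vectors = [np.asarray(compute_x_vector(w)) for w in wav_paths]
    return np.stack(vectors).mean(axis=0)

def fake_encoder(wav_path):
    # Toy "encoder": a deterministic fixed-size embedding per clip.
    return np.random.RandomState(len(wav_path)).randn(256)

x_vector = average_x_vector(["clip1.wav", "clip2.wav"], fake_encoder)
print(x_vector.shape)  # (256,)
```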
@@ -107,6 +107,7 @@ class ModelManager(object):
         os.makedirs(output_path, exist_ok=True)
         print(f" > Downloading model to {output_path}")
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
+        output_speakers_path = os.path.join(output_path, "speakers.json")
         # download files to the output path
         if self._check_dict_key(model_item, "github_rls_url"):
             # download from github release
@@ -119,7 +120,7 @@ class ModelManager(object):
         if self._check_dict_key(model_item, "stats_file"):
             self._download_gdrive_file(model_item["stats_file"], output_stats_path)

-        # set the scale_path.npy file path in the model config.json
+        # update the scale_path.npy file path in the model config.json
         if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
             # set scale stats path in config.json
             config_path = output_config_path
@@ -127,6 +128,14 @@ class ModelManager(object):
             config["audio"]["stats_path"] = output_stats_path
             with open(config_path, "w") as jf:
                 json.dump(config, jf)
+        # update the speakers.json file path in the model config.json to the current path
+        if os.path.exists(output_speakers_path):
+            # set the speakers file path in config.json
+            config_path = output_config_path
+            config = load_config(config_path)
+            config["external_speaker_embedding_file"] = output_speakers_path
+            with open(config_path, "w") as jf:
+                json.dump(config, jf)
         return output_model_path, output_config_path, model_item

     def _download_gdrive_file(self, gdrive_idx, output):
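The block added here rewrites the downloaded config so `external_speaker_embedding_file` points at the local speakers.json. The same read-modify-write step as a standalone sketch, using plain `json` where the real code goes through `load_config`:

```
import json
import os

def point_config_at_speakers_file(config_path, output_path):
    # Rewrite the model's config.json so that it references the
    # speakers.json sitting next to the downloaded model files.
    speakers_path = os.path.join(output_path, "speakers.json")
    if not os.path.exists(speakers_path):
        return
    with open(config_path) as jf:  # the real code uses load_config()
        config = json.load(jf)
    config["external_speaker_embedding_file"] = speakers_path
    with open(config_path, "w") as jf:
        json.dump(config, jf)
```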
@@ -127,6 +127,9 @@ class Synthesizer(object):
         self.input_size = len(symbols)

+        if self.tts_config.use_speaker_embedding is True:
+            self.tts_speakers_file = (
+                self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
+            )
+            self._load_speakers(self.tts_speakers_file)
+
         self.tts_model = setup_model(
@@ -189,15 +192,27 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self._split_into_sentences(text)
+        sens = self.split_into_sentences(text)
         print(" > Text split into sentences.")
         print(sens)

-        # get the speaker embedding from the saved x_vectors.
-        if speaker_idx and isinstance(speaker_idx, str):
-            speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
-        else:
-            speaker_embedding = None
+        if self.tts_speakers_file:
+            # get the speaker embedding from the saved x_vectors.
+            if speaker_idx and isinstance(speaker_idx, str):
+                speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+            elif not speaker_idx and not speaker_wav:
+                raise ValueError(
+                    " [!] Looks like you use a multi-speaker model. "
+                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
+                )
+            else:
+                speaker_embedding = None
+        else:
+            speaker_embedding = None
+            if speaker_idx:
+                raise ValueError(
+                    f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}. "
+                    "Define a path for speakers.json if it is a multi-speaker model, or remove the defined speaker idx."
+                )

         # compute a new x_vector from the given clip.
         if speaker_wav is not None:
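The new branching in `tts()` reduces to four cases. A hypothetical distillation of it as a standalone helper; the function name and error strings are illustrative, the logic mirrors the diff:

```
def select_speaker_embedding(tts_speakers_file, speaker_manager, speaker_idx=None, speaker_wav=None):
    """Hypothetical distillation of the branching added to Synthesizer.tts()."""
    if tts_speakers_file:
        if speaker_idx and isinstance(speaker_idx, str):
            # saved x-vector looked up by speaker ID
            return speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
        if not speaker_idx and not speaker_wav:
            raise ValueError("multi-speaker model: pass a speaker_idx or a speaker_wav")
        return None  # speaker_wav given; the x-vector is computed later from the clip
    if speaker_idx:
        raise ValueError(f"no speakers.json available to resolve speaker {speaker_idx}")
    return None  # single-speaker model
```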
@@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake

 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
+
     @staticmethod
     def test_speaker_embedding():
         # load config
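A hedged companion to the test above: checking that every entry in a dummy speaker mapping carries an embedding of the same dimension, mirroring how `parse_speakers()` reads the first entry. The inline mapping stands in for loading the dummy speakers file:

```
import unittest

class DummySpeakersFileTest(unittest.TestCase):
    """Hypothetical companion check: the dummy speaker mapping yields a
    consistent embedding dimension, mirroring parse_speakers()."""

    def test_embedding_dim(self):
        # stand-in for loading the dummy speakers file from the test inputs
        speaker_mapping = {
            "clip_0001": {"name": "spk_a", "embedding": [0.0] * 256},
            "clip_0002": {"name": "spk_b", "embedding": [1.0] * 256},
        }
        dims = {len(v["embedding"]) for v in speaker_mapping.values()}
        self.assertEqual(dims, {256})

if __name__ == "__main__":
    unittest.main()
```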