Add docs for `SpeakerManager`

2021-07-03 13:55:27 +02:00 · 2021-07-03 13:55:27 +02:00 · c25a2184e7
parent f382e4c700
commit c25a2184e7
3 changed files with 41 additions and 26 deletions
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@ -13,14 +13,16 @@ from TTS.utils.audio import AudioProcessor
 class SpeakerManager:
-    """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information
+    """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
-    in a way that you can query. There are 3 different scenarios considered.
+    in a way that can be queried by speaker or clip.
-    1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids.
+    There are 3 different scenarios considered:
-    2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the following
+
-    format.
+    1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
    2. Models using d-vectors. The datafile includes a dictionary in the following format.
    ::
    ```
        {
            'clip_name.wav':{
                'name': 'speakerA',
@ -28,18 +30,10 @@ class SpeakerManager:
            },
            ...
        }
    ```
    3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and
    computes x vectors for a given instance.
-    >>> >>> # load audio processor and speaker encoder
+    3. Computing the d-vectors with the speaker encoder. It loads the speaker encoder model and
-    >>> ap = AudioProcessor(**config.audio)
+    computes the d-vectors for a given clip or speaker.
    >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
    >>> # load a sample audio and compute embedding
    >>> waveform = ap.load_wav(sample_wav_path)
    >>> mel = ap.melspectrogram(waveform)
    >>> d_vector = manager.compute_d_vector(mel.T)
    Args:
        d_vectors_file_path (str, optional): Path to the metafile including d-vectors. Defaults to "".
@ -47,6 +41,15 @@ class SpeakerManager:
        TTS models. Defaults to "".
        encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
    Examples:
        >>> # load audio processor and speaker encoder
        >>> ap = AudioProcessor(**config.audio)
        >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
        >>> # load a sample audio and compute embedding
        >>> waveform = ap.load_wav(sample_wav_path)
        >>> mel = ap.melspectrogram(waveform)
        >>> d_vector = manager.compute_d_vector(mel.T)
    """
    def __init__(
@ -311,7 +314,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
 def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
-    """Create a SpeakerManager instance based on provided configuration.
+    """Initiate a `SpeakerManager` instance by the provided config.
    Args:
        c (Coqpit): Model configuration.
@ -321,7 +324,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
        out_path (str, optional): Save the generated speaker IDs to an output path. Defaults to None.
    Returns:
-        SpeakerManager:
+        SpeakerManager: initialized and ready to use instance.
    """
    speaker_manager = SpeakerManager()
    if c.use_speaker_embedding:
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -37,6 +37,7 @@
    main_classes/model_api
    main_classes/dataset
    main_classes/gan
    main_classes/speaker_manager
 .. toctree::
    :maxdepth: 2
--- a/docs/source/main_classes/speaker_manager.md
+++ b/docs/source/main_classes/speaker_manager.md
@ -0,0 +1,11 @@
 # Speaker Manager API
 The {class}`TTS.tts.utils.speakers.SpeakerManager` organizes speaker-related data and information for 🐸TTS models. It is
 especially useful for multi-speaker models.
 ## Speaker Manager
 ```{eval-rst}
 .. automodule:: TTS.tts.utils.speakers
    :members:
 ```