Add docs for `SpeakerManager`

This commit is contained in:
Eren Gölge 2021-07-03 13:55:27 +02:00
parent f382e4c700
commit c25a2184e7
3 changed files with 41 additions and 26 deletions

View File

@ -13,14 +13,16 @@ from TTS.utils.audio import AudioProcessor
class SpeakerManager: class SpeakerManager:
"""It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
in a way that you can query. There are 3 different scenarios considered. in a way that can be queried by speaker or clip.
1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids. There are 3 different scenarios considered:
2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the following
format. 1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
2. Models using d-vectors. The datafile includes a dictionary in the following format.
::
```
{ {
'clip_name.wav':{ 'clip_name.wav':{
'name': 'speakerA', 'name': 'speakerA',
@ -28,18 +30,10 @@ class SpeakerManager:
}, },
... ...
} }
```
3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and
computes x vectors for a given instance.
>>> >>> # load audio processor and speaker encoder 3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and
>>> ap = AudioProcessor(**config.audio) computes the d-vectors for a given clip or speaker.
>>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
>>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform)
>>> d_vector = manager.compute_d_vector(mel.T)
Args: Args:
d_vectors_file_path (str, optional): Path to the metafile including d-vectors. Defaults to "". d_vectors_file_path (str, optional): Path to the metafile including d-vectors. Defaults to "".
@ -47,6 +41,15 @@ class SpeakerManager:
TTS models. Defaults to "". TTS models. Defaults to "".
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "". encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
Examples:
>>> # load audio processor and speaker encoder
>>> ap = AudioProcessor(**config.audio)
>>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
>>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform)
>>> d_vector = manager.compute_d_vector(mel.T)
""" """
def __init__( def __init__(
@ -311,7 +314,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
"""Create a SpeakerManager instance based on provided configuration. """Initialize a `SpeakerManager` instance from the provided config.
Args: Args:
c (Coqpit): Model configuration. c (Coqpit): Model configuration.
@ -321,7 +324,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
out_path (str, optional): Save the generated speaker IDs to an output path. Defaults to None. out_path (str, optional): Save the generated speaker IDs to an output path. Defaults to None.
Returns: Returns:
SpeakerManager: SpeakerManager: initialized and ready to use instance.
""" """
speaker_manager = SpeakerManager() speaker_manager = SpeakerManager()
if c.use_speaker_embedding: if c.use_speaker_embedding:

View File

@ -37,6 +37,7 @@
main_classes/model_api main_classes/model_api
main_classes/dataset main_classes/dataset
main_classes/gan main_classes/gan
main_classes/speaker_manager
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2

View File

@ -0,0 +1,11 @@
# Speaker Manager API
The {class}`TTS.tts.utils.speakers.SpeakerManager` organizes speaker-related data and information for 🐸TTS models. It is
especially useful for multi-speaker models.
## Speaker Manager
```{eval-rst}
.. automodule:: TTS.tts.utils.speakers
:members:
```