mirror of https://github.com/coqui-ai/TTS.git
Comment synthesis.py
commit aea90e2501
parent 1987aaaaed
@@ -172,7 +172,7 @@ def speaker_id_to_torch(speaker_id, cuda=False):
 def embedding_to_torch(d_vector, cuda=False):
     if d_vector is not None:
         d_vector = np.asarray(d_vector)
-        d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor)
+        d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
         if cuda:
             return d_vector.cuda()
     return d_vector
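The only functional change in this hunk is dropping the unsqueeze(0) call: embedding_to_torch no longer prepends a batch dimension and instead returns the d-vector in whatever shape the caller provides (the new docstring below documents the expected shape as [1, D]). A minimal sketch of the before/after behavior, using a made-up 256-dimensional embedding:

    import numpy as np
    import torch

    d_vector = np.zeros(256)  # hypothetical 256-dim speaker embedding

    # Before this commit: a leading batch dimension was always added.
    before = torch.from_numpy(np.asarray(d_vector)).unsqueeze(0).type(torch.FloatTensor)
    print(before.shape)  # torch.Size([1, 256])

    # After this commit: the tensor keeps the input's shape, so callers that
    # need a [1, D] d-vector must now pass it in that shape themselves.
    after = torch.from_numpy(np.asarray(d_vector)).type(torch.FloatTensor)
    print(after.shape)  # torch.Size([256])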
@@ -210,20 +210,42 @@ def synthesis(
     d_vector=None,
     backend="torch",
 ):
-    """Synthesize voice for the given text.
+    """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
+    the vocoder model.

     Args:
-        model (TTS.tts.models): model to synthesize.
-        text (str): target text
-        CONFIG (dict): config dictionary to be loaded from config.json.
-        use_cuda (bool): enable cuda.
-        ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
-            model outputs.
-        speaker_id (int): id of speaker
-        style_wav (str | Dict[str, float]): Uses for style embedding of GST.
-        enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
-        do_trim_silence (bool): trim silence after synthesis.
-        backend (str): tf or torch
+        model (TTS.tts.models):
+            The TTS model to synthesize audio with.
+
+        text (str):
+            The input text to convert to speech.
+
+        CONFIG (Coqpit):
+            Model configuration.
+
+        use_cuda (bool):
+            Enable/disable CUDA.
+
+        ap (TTS.tts.utils.audio.AudioProcessor):
+            The audio processor for extracting features and pre/post-processing audio.
+
+        speaker_id (int):
+            Speaker ID passed to the speaker embedding layer in a multi-speaker model. Defaults to None.
+
+        style_wav (str | Dict[str, float]):
+            Path or tensor to/of a waveform used for computing the style embedding. Defaults to None.
+
+        enable_eos_bos_chars (bool):
+            Enable special characters for end of sentence and start of sentence. Defaults to False.
+
+        do_trim_silence (bool):
+            Trim silence after synthesis. Defaults to False.
+
+        d_vector (torch.Tensor):
+            d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
+
+        backend (str):
+            tf or torch. Defaults to "torch".
     """
     # GST processing
     style_mel = None
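For reference, a call exercising the arguments documented above might look like the sketch below. It assumes model, CONFIG, and ap have already been loaded elsewhere, and it uses only the parameters listed in the docstring; the diff shows just part of the signature, so any parameters outside this hunk are omitted.

    # Hypothetical call based only on the arguments listed in the docstring;
    # model, CONFIG, and ap are assumed to be loaded elsewhere.
    outputs = synthesis(
        model,
        "Hello world.",
        CONFIG,
        use_cuda=False,
        ap=ap,
        speaker_id=None,
        style_wav=None,
        enable_eos_bos_chars=False,
        do_trim_silence=False,
        d_vector=None,
        backend="torch",
    )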