From aea90e250124899fa4f1c04298b595550078eaa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 21 Oct 2021 13:53:45 +0000 Subject: [PATCH] Comment synthesis.py --- TTS/tts/utils/synthesis.py | 48 +++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index ca15f4cc..5185139e 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -172,7 +172,7 @@ def speaker_id_to_torch(speaker_id, cuda=False): def embedding_to_torch(d_vector, cuda=False): if d_vector is not None: d_vector = np.asarray(d_vector) - d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) + d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) if cuda: return d_vector.cuda() return d_vector @@ -210,20 +210,42 @@ def synthesis( d_vector=None, backend="torch", ): - """Synthesize voice for the given text. + """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to + the vocoder model. Args: - model (TTS.tts.models): model to synthesize. - text (str): target text - CONFIG (dict): config dictionary to be loaded from config.json. - use_cuda (bool): enable cuda. - ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process - model outputs. - speaker_id (int): id of speaker - style_wav (str | Dict[str, float]): Uses for style embedding of GST. - enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. - do_trim_silence (bool): trim silence after synthesis. - backend (str): tf or torch + model (TTS.tts.models): + The TTS model to synthesize audio with. + + text (str): + The input text to convert to speech. + + CONFIG (Coqpit): + Model configuration. + + use_cuda (bool): + Enable/disable CUDA. + + ap (TTS.tts.utils.audio.AudioProcessor): + The audio processor for extracting features and pre/post-processing audio. 
+ + speaker_id (int): + Speaker ID passed to the speaker embedding layer in a multi-speaker model. Defaults to None. + + style_wav (str | Dict[str, float]): + Path to, or tensor of, a waveform used for computing the style embedding. Defaults to None. + + enable_eos_bos_chars (bool): + Enable special characters for end of sentence and start of sentence. Defaults to False. + + do_trim_silence (bool): + Trim silence after synthesis. Defaults to False. + + d_vector (torch.Tensor): + d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None. + + backend (str): + tf or torch. Defaults to "torch". """ # GST processing style_mel = None