<add> Chinese mandarin implementation (tacotron2)

2021-02-15 16:04:47 +01:00 · 2021-02-15 16:04:47 +01:00 · 42ba30eb8f
parent 49665783a6
commit 42ba30eb8f
3 changed files with 14 additions and 11 deletions
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@ -372,15 +372,10 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):



-def baker(root_path: str, meta_file: str) ->  List[List[str]]:
-    """Normalizes the Baker meta data file to TTS format

-    Args:
-        root_path (str): path to the baker dataset
-        meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence  
-    Returns:
-        List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences
-    """
+# ======================================== Baker (Chinese Mandarin, single speaker) ===========================================
+def baker(root_path, meta_file):
+    """Normalizes the Baker meta data file to TTS format"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
@ -389,4 +384,4 @@ def baker(root_path: str, meta_file: str) ->  List[List[str]]:
            wav_name, text = line.rstrip('\n').split("|")
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
-    return items 
+    return items
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@ -1,5 +1,5 @@
 import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import pkg_resources
 installed = {pkg.key for pkg in pkg_resources.working_set}  #pylint: disable=not-an-iterable
 if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
@ -220,6 +220,7 @@ def synthesis(model,
                model outputs.
            speaker_id (int): id of speaker
            style_wav (str | Dict[str, float]): Uses for style embedding of GST.
+            style_wav (str): Used for the style embedding of GST.
            truncated (bool): keep model states after inference. It can be used
                for continuous inference at long texts.
            enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@ -122,6 +122,13 @@ class Synthesizer(object):
        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None

+
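+        # Use the GST style input from the config when GST is enabled and one is set. NOTE(review): the assignment below targets `style_wav`, but `gst_style_input` is what gets passed to synthesis(), so it appears to stay None -- confirm.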
+        gst_style_input = None
+        if self.tts_config.use_gst:
+            if self.tts_config.gst["gst_style_input"] not in ["", {}]:
+                style_wav = self.tts_config.gst["gst_style_input"]
+
        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
@ -131,7 +138,7 @@ class Synthesizer(object):
                self.use_cuda,
                self.ap,
                speaker_idx,
-                None,
+                gst_style_input,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,