diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 26c17174..01a3ce1c 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -372,15 +372,10 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): -def baker(root_path: str, meta_file: str) -> List[List[str]]: - """Normalizes the Baker meta data file to TTS format - Args: - root_path (str): path to the baker dataset - meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence - Returns: - List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences - """ +# ======================================== Baker (chinese mandarin single speaker) =========================================== +def baker(root_path, meta_file): + """Normalizes the Baker meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" @@ -389,4 +384,4 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_name, text = line.rstrip('\n').split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) - return items + return items diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index adbd0d20..f407f605 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,5 +1,5 @@ import os -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' import pkg_resources installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable if 'tensorflow' in installed or 'tensorflow-gpu' in installed: @@ -220,6 +220,7 @@ def synthesis(model, model outputs. speaker_id (int): id of speaker style_wav (str | Dict[str, float]): Uses for style embedding of GST. + style_wav (str): Uses for style embedding of GST. truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2a779e53..4b4bc04c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,6 +122,13 @@ class Synthesizer(object): speaker_embedding = self.init_speaker(speaker_idx) use_gl = self.vocoder_model is None + + # check if compute gst style + gst_style_input = None + if self.tts_config.use_gst: + if self.tts_config.gst["gst_style_input"] not in ["", {}]: + style_wav = self.tts_config.gst["gst_style_input"] + for sen in sens: # synthesize voice waveform, _, _, mel_postnet_spec, _, _ = synthesis( @@ -131,7 +138,7 @@ class Synthesizer(object): self.use_cuda, self.ap, speaker_idx, - None, + gst_style_input, False, self.tts_config.enable_eos_bos_chars, use_gl,