mirror of https://github.com/coqui-ai/TTS.git
<add> Chinese mandarin implementation (tacotron2)
This commit is contained in:
parent
49665783a6
commit
42ba30eb8f
|
@ -372,15 +372,10 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def baker(root_path: str, meta_file: str) -> List[List[str]]:
|
|
||||||
"""Normalizes the Baker meta data file to TTS format
|
|
||||||
|
|
||||||
Args:
|
# ======================================== Baker (chinese mandarin single speaker) ===========================================
|
||||||
root_path (str): path to the baker dataset
|
def baker(root_path, meta_file):
|
||||||
meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence
|
"""Normalizes the Baker meta data file to TTS format"""
|
||||||
Returns:
|
|
||||||
List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences
|
|
||||||
"""
|
|
||||||
txt_file = os.path.join(root_path, meta_file)
|
txt_file = os.path.join(root_path, meta_file)
|
||||||
items = []
|
items = []
|
||||||
speaker_name = "baker"
|
speaker_name = "baker"
|
||||||
|
@ -389,4 +384,4 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
|
||||||
wav_name, text = line.rstrip('\n').split("|")
|
wav_name, text = line.rstrip('\n').split("|")
|
||||||
wav_path = os.path.join(root_path, "clips_22", wav_name)
|
wav_path = os.path.join(root_path, "clips_22", wav_name)
|
||||||
items.append([text, wav_path, speaker_name])
|
items.append([text, wav_path, speaker_name])
|
||||||
return items
|
return items
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import os
|
import os
|
||||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
|
installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
|
||||||
if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
|
if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
|
||||||
|
@ -220,6 +220,7 @@ def synthesis(model,
|
||||||
model outputs.
|
model outputs.
|
||||||
speaker_id (int): id of speaker
|
speaker_id (int): id of speaker
|
||||||
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
|
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
|
||||||
|
style_wav (str): Uses for style embedding of GST.
|
||||||
truncated (bool): keep model states after inference. It can be used
|
truncated (bool): keep model states after inference. It can be used
|
||||||
for continuous inference at long texts.
|
for continuous inference at long texts.
|
||||||
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
|
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
|
||||||
|
|
|
@ -122,6 +122,13 @@ class Synthesizer(object):
|
||||||
speaker_embedding = self.init_speaker(speaker_idx)
|
speaker_embedding = self.init_speaker(speaker_idx)
|
||||||
use_gl = self.vocoder_model is None
|
use_gl = self.vocoder_model is None
|
||||||
|
|
||||||
|
|
||||||
|
# check if compute gst style
|
||||||
|
gst_style_input = None
|
||||||
|
if self.tts_config.use_gst:
|
||||||
|
if self.tts_config.gst["gst_style_input"] not in ["", {}]:
|
||||||
|
style_wav = self.tts_config.gst["gst_style_input"]
|
||||||
|
|
||||||
for sen in sens:
|
for sen in sens:
|
||||||
# synthesize voice
|
# synthesize voice
|
||||||
waveform, _, _, mel_postnet_spec, _, _ = synthesis(
|
waveform, _, _, mel_postnet_spec, _, _ = synthesis(
|
||||||
|
@ -131,7 +138,7 @@ class Synthesizer(object):
|
||||||
self.use_cuda,
|
self.use_cuda,
|
||||||
self.ap,
|
self.ap,
|
||||||
speaker_idx,
|
speaker_idx,
|
||||||
None,
|
gst_style_input,
|
||||||
False,
|
False,
|
||||||
self.tts_config.enable_eos_bos_chars,
|
self.tts_config.enable_eos_bos_chars,
|
||||||
use_gl,
|
use_gl,
|
||||||
|
|
Loading…
Reference in New Issue