Fix phonemes

2021-11-06 00:27:58 +01:00 · 2021-11-06 00:27:58 +01:00 · 120332d53f
parent 846bf16f02
commit 120332d53f
4 changed files with 8 additions and 5 deletions
--- a/TTS/bin/find_unique_phonemes.py
+++ b/TTS/bin/find_unique_phonemes.py
@@ -7,7 +7,7 @@ from tqdm.contrib.concurrent import process_map

 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.utils.text import text2phone
+from TTS.tts.utils.text import text2phone, phoneme_to_sequence


 def compute_phonemes(item):
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -273,7 +273,7 @@ class TTSDataset(Dataset):
        item = args[0]
        func_args = args[1]
        text, wav_file, *_ = item
-        func_args[3] = item[4]
+        func_args[3] = item[3]
        phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
        return phonemes

--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -543,6 +543,7 @@ class Vits(BaseTTS):
            "style_wav": style_wav,
            "d_vector": d_vector,
            "language_id": language_id,
+            "language_name": language_name,
        }

    def forward(
@@ -1061,6 +1062,7 @@ class Vits(BaseTTS):
                    d_vector=aux_inputs["d_vector"],
                    style_wav=aux_inputs["style_wav"],
                    language_id=aux_inputs["language_id"],
+                    language_name=aux_inputs["language_name"],
                    enable_eos_bos_chars=self.config.enable_eos_bos_chars,
                    use_griffin_lim=True,
                    do_trim_silence=False,
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -15,7 +15,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed:
    import tensorflow as tf


-def text_to_seq(text, CONFIG, custom_symbols=None):
+def text_to_seq(text, CONFIG, custom_symbols=None, language=None):
    text_cleaner = [CONFIG.text_cleaner]
    # text ot phonemes to sequence vector
    if CONFIG.use_phonemes:
@@ -23,7 +23,7 @@ def text_to_seq(text, CONFIG, custom_symbols=None):
            phoneme_to_sequence(
                text,
                text_cleaner,
-                CONFIG.phoneme_language,
+                language if language else CONFIG.phoneme_language,
                CONFIG.enable_eos_bos_chars,
                tp=CONFIG.characters,
                add_blank=CONFIG.add_blank,
@@ -212,6 +212,7 @@ def synthesis(
    do_trim_silence=False,
    d_vector=None,
    language_id=None,
+    language_name=None,
    backend="torch",
 ):
    """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
@@ -262,7 +263,7 @@ def synthesis(
    if hasattr(model, "make_symbols"):
        custom_symbols = model.make_symbols(CONFIG)
    # preprocess the given text
-    text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols)
+    text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols, language=language_name)
    # pass tensors to backend
    if backend == "torch":
        if speaker_id is not None: