From c9972e6f145a3cbf3b79c24b5c18c6d654b86f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:51:58 +0000 Subject: [PATCH] Make lint --- TTS/tts/datasets/__init__.py | 4 +- TTS/tts/datasets/dataset.py | 157 +++++++------- TTS/tts/utils/synthesis.py | 4 +- TTS/tts/utils/text/characters.py | 191 ++++++++++++------ TTS/tts/utils/text/phonemizers/base.py | 36 ++-- .../utils/text/phonemizers/espeak_wrapper.py | 65 +++--- .../utils/text/phonemizers/gruut_wrapper.py | 3 +- .../text/phonemizers/ja_jp_phonemizer.py | 20 +- .../text/phonemizers/multi_phonemizer.py | 28 +-- .../text/phonemizers/zh_cn_phonemizer.py | 23 ++- TTS/tts/utils/text/punctuation.py | 18 +- TTS/utils/audio.py | 3 +- TTS/utils/synthesizer.py | 1 - TTS/vocoder/models/gan.py | 2 +- 14 files changed, 319 insertions(+), 236 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d80e92c9..f0a6ea95 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -111,8 +111,8 @@ def load_tts_samples( meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for the duration predictor training - if dataset.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + if d.meta_file_attn_mask: + meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins["audio_file"]].strip() meta_data_train_all[idx].update({"alignment_file": attn_file}) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index d4a12c07..210de803 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,7 +1,6 @@ import collections import os import random -from multiprocessing import Pool from typing import Dict, List, Union import numpy as np @@ -10,7 +9,6 @@ import tqdm from torch.utils.data import Dataset from TTS.tts.utils.data import 
prepare_data, prepare_stop_target, prepare_tensor -from TTS.tts.utils.text import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -183,7 +181,7 @@ class TTSDataset(Dataset): def get_phonemes(self, idx, text): out_dict = self.phoneme_dataset[idx] assert text == out_dict["text"], f"{text} != {out_dict['text']}" - assert out_dict["token_ids"].size > 0 + assert len(out_dict["token_ids"]) > 0 return out_dict def get_f0(self, idx): @@ -192,7 +190,8 @@ class TTSDataset(Dataset): assert wav_file == out_dict["audio_file"] return out_dict - def get_attn_maks(self, attn_file): + @staticmethod + def get_attn_mask(attn_file): return np.load(attn_file) def get_token_ids(self, idx, text): @@ -207,7 +206,7 @@ class TTSDataset(Dataset): raw_text = item["text"] - wav = np.asarray(self.load_wav(item[]), dtype=np.float32) + wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32) # apply noise for augmentation if self.use_noise_augment: @@ -262,7 +261,7 @@ class TTSDataset(Dataset): idxs = np.argsort(lengths) # ascending order ignore_idx = [] keep_idx = [] - for i, idx in enumerate(idxs): + for idx in idxs: length = lengths[idx] if length < min_len or length > max_len: ignore_idx.append(idx) @@ -277,6 +276,7 @@ class TTSDataset(Dataset): @staticmethod def create_buckets(samples, batch_group_size: int): + assert batch_group_size > 0 for i in range(len(samples) // batch_group_size): offset = i * batch_group_size end_offset = offset + batch_group_size @@ -319,7 +319,8 @@ class TTSDataset(Dataset): # shuffle batch groups # create batches with similar length items # the larger the `batch_group_size`, the higher the length variety in a batch. - samples = self.create_buckets(samples, self.batch_group_size) + if self.batch_group_size > 0: + samples = self.create_buckets(samples, self.batch_group_size) # update items to the new sorted items self.samples = samples @@ -571,6 +572,7 @@ class PhonemeDataset(Dataset): We use pytorch dataloader because we are lazy. 
""" + print("[*] Pre-computing phonemes...") with tqdm.tqdm(total=len(self)) as pbar: batch_size = num_workers if num_workers > 0 else 1 dataloder = torch.utils.data.DataLoader( @@ -658,16 +660,21 @@ class F0Dataset: return len(self.samples) def precompute(self, num_workers=0): + print("[*] Pre-computing F0s...") with tqdm.tqdm(total=len(self)) as pbar: batch_size = num_workers if num_workers > 0 else 1 + # we do not normalize at preprocessing + normalize_f0 = self.normalize_f0 + self.normalize_f0 = False dataloder = torch.utils.data.DataLoader( batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn ) computed_data = [] for batch in dataloder: f0 = batch["f0"] - computed_data.append([f for f in f0]) + computed_data.append(f for f in f0) pbar.update(batch_size) + self.normalize_f0 = normalize_f0 if self.normalize_f0: computed_data = [tensor for batch in computed_data for tensor in batch] # flatten @@ -746,80 +753,80 @@ class F0Dataset: print(f"{indent}| > Number of instances : {len(self.samples)}") -if __name__ == "__main__": - from torch.utils.data import DataLoader +# if __name__ == "__main__": +# from torch.utils.data import DataLoader - from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig - from TTS.tts.datasets import load_tts_samples - from TTS.tts.utils.text.characters import IPAPhonemes - from TTS.tts.utils.text.phonemizers import ESpeak +# from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig +# from TTS.tts.datasets import load_tts_samples +# from TTS.tts.utils.text.characters import IPAPhonemes +# from TTS.tts.utils.text.phonemizers import ESpeak - dataset_config = BaseDatasetConfig( - name="ljspeech", - meta_file_train="metadata.csv", - path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1", - ) - train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) - samples = train_samples + eval_samples +# dataset_config = BaseDatasetConfig( +# 
name="ljspeech", +# meta_file_train="metadata.csv", +# path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1", +# ) +# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +# samples = train_samples + eval_samples - phonemizer = ESpeak(language="en-us") - tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer) - # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests") - # ph_dataset.precompute(num_workers=4) +# phonemizer = ESpeak(language="en-us") +# tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer) +# # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests") +# # ph_dataset.precompute(num_workers=4) - # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn) - # for batch in dataloader: - # print(batch) - # break +# # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn) +# # for batch in dataloader: +# # print(batch) +# # break - audio_config = BaseAudioConfig( - sample_rate=22050, - win_length=1024, - hop_length=256, - num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=True, - trim_db=45, - mel_fmin=0, - mel_fmax=8000, - spec_gain=1.0, - signal_norm=False, - do_amp_to_db_linear=False, - ) +# audio_config = BaseAudioConfig( +# sample_rate=22050, +# win_length=1024, +# hop_length=256, +# num_mels=80, +# preemphasis=0.0, +# ref_level_db=20, +# log_func="np.log", +# do_trim_silence=True, +# trim_db=45, +# mel_fmin=0, +# mel_fmax=8000, +# spec_gain=1.0, +# signal_norm=False, +# do_amp_to_db_linear=False, +# ) - ap = AudioProcessor.init_from_config(audio_config) +# ap = AudioProcessor.init_from_config(audio_config) - # f0_dataset = F0Dataset(samples, ap, 
cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4) +# # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4) - # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn) - # for batch in dataloader: - # print(batch) - # breakpoint() - # break +# # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn) +# # for batch in dataloader: +# # print(batch) +# # breakpoint() +# # break - dataset = TTSDataset( - outputs_per_step=1, - compute_linear_spec=False, - samples=samples, - ap=ap, - return_wav=False, - batch_group_size=0, - min_seq_len=0, - max_seq_len=500, - use_noise_augment=False, - verbose=True, - speaker_id_mapping=None, - d_vector_mapping=None, - compute_f0=True, - f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests", - tokenizer=tokenizer, - phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests", - precompute_num_workers=4, - ) +# dataset = TTSDataset( +# outputs_per_step=1, +# compute_linear_spec=False, +# samples=samples, +# ap=ap, +# return_wav=False, +# batch_group_size=0, +# min_seq_len=0, +# max_seq_len=500, +# use_noise_augment=False, +# verbose=True, +# speaker_id_mapping=None, +# d_vector_mapping=None, +# compute_f0=True, +# f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests", +# tokenizer=tokenizer, +# phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests", +# precompute_num_workers=4, +# ) - dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn) - for batch in dataloader: - print(batch) - break +# dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn) +# for batch in dataloader: +# print(batch) +# break diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 
979769a8..65dcc1ad 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -199,10 +199,10 @@ def synthesis( wav = model_outputs.squeeze(0) else: if use_griffin_lim: - wav = inv_spectrogram(model_outputs, ap, CONFIG) + wav = inv_spectrogram(model_outputs, model.ap, CONFIG) # trim silence if do_trim_silence: - wav = trim_silence(wav, ap) + wav = trim_silence(wav, model.ap) return_dict = { "wav": wav, "alignments": alignments, diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 24ce51f1..aae6844f 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,3 +1,8 @@ +from dataclasses import replace + +from TTS.tts.configs.shared_configs import CharactersConfig + + def parse_symbols(): return { "pad": _pad, @@ -29,46 +34,49 @@ _diacrilics = "ɚ˞ɫ" _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics -def create_graphemes( - characters=_characters, - punctuations=_punctuations, - pad=_pad, - eos=_eos, - bos=_bos, - blank=_blank, - unique=True, -): # pylint: disable=redefined-outer-name - """Function to create default characters and phonemes""" - # create graphemes - _graphemes = list(characters) - _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes - _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes - _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes - _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes - _graphemes = _graphemes + list(punctuations) - return _graphemes, _phonemes +# def create_graphemes( +# characters=_characters, +# punctuations=_punctuations, +# pad=_pad, +# eos=_eos, +# bos=_bos, +# blank=_blank, +# unique=True, +# ): # pylint: disable=redefined-outer-name +# """Function to create default characters and phonemes""" +# # create graphemes +# = ( +# 
sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) +# ) # this is to keep previous models compatible. +# _graphemes = list(characters) +# _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes +# _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes +# _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes +# _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes +# _graphemes = _graphemes + list(punctuations) +# return _graphemes, _phonemes -def create_phonemes( - phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True -): - # create phonemes - _phonemes = None - _phonemes_sorted = ( - sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) - ) # this is to keep previous models compatible. - _phonemes = list(_phonemes_sorted) - _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes - _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes - _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes - _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes - _phonemes = _phonemes + list(punctuations) - _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) - return _phonemes +# def create_phonemes( +# phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True +# ): +# # create phonemes +# _phonemes = None +# _phonemes_sorted = ( +# sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) +# ) # this is to keep previous models compatible. 
+# _phonemes = list(_phonemes_sorted) +# _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes +# _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes +# _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes +# _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes +# _phonemes = _phonemes + list(punctuations) +# _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) +# return _phonemes -graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos) -phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank) +# DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos) +# DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank) class BaseCharacters: @@ -114,7 +122,7 @@ class BaseCharacters: eos: str, bos: str, blank: str, - is_unique: bool = True, + is_unique: bool = False, is_sorted: bool = True, ) -> None: self._characters = characters @@ -202,14 +210,20 @@ class BaseCharacters: _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab self._vocab = _vocab + list(self._punctuations) self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} - self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} + self._id_to_char = { + idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension + } if self.is_unique: + duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} assert ( len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}" + ), f" [!] There are duplicate characters in the character set. 
{duplicates}" def char_to_id(self, char: str) -> int: - return self._char_to_id[char] + try: + return self._char_to_id[char] + except KeyError as e: + raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e def id_to_char(self, idx: int) -> str: return self._id_to_char[idx] @@ -229,9 +243,23 @@ class BaseCharacters: print(f"{indent}| > Num chars: {self.num_chars}") @staticmethod - def init_from_config(config: "Coqpit"): - return BaseCharacters( - **config.characters if config.characters is not None else {}, + def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument + """Init your character class from a config. + + Implement this method for your subclass. + """ + ... + + def to_config(self) -> "CharactersConfig": + return CharactersConfig( + characters=self._characters, + punctuations=self._punctuations, + pad=self._pad, + eos=self._eos, + bos=self._bos, + blank=self._blank, + is_unique=self.is_unique, + is_sorted=self.is_sorted, ) @@ -275,31 +303,42 @@ class IPAPhonemes(BaseCharacters): eos: str = _eos, bos: str = _bos, blank: str = _blank, - is_unique: bool = True, + is_unique: bool = False, is_sorted: bool = True, ) -> None: super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted) @staticmethod def init_from_config(config: "Coqpit"): + """Init a IPAPhonemes object from a model config + + If characters are not defined in the config, it will be set to the default characters and the config + will be updated. 
+ """ # band-aid for compatibility with old models if "characters" in config and config.characters is not None: if "phonemes" in config.characters and config.characters.phonemes is not None: config.characters["characters"] = config.characters["phonemes"] - return IPAPhonemes( - characters=config.characters["characters"], - punctuations=config.characters["punctuations"], - pad=config.characters["pad"], - eos=config.characters["eos"], - bos=config.characters["bos"], - blank=config.characters["blank"], - is_unique=config.characters["is_unique"], - is_sorted=config.characters["is_sorted"], - ) - else: - return IPAPhonemes( - **config.characters if config.characters is not None else {}, + return ( + IPAPhonemes( + characters=config.characters["characters"], + punctuations=config.characters["punctuations"], + pad=config.characters["pad"], + eos=config.characters["eos"], + bos=config.characters["bos"], + blank=config.characters["blank"], + is_unique=config.characters["is_unique"], + is_sorted=config.characters["is_sorted"], + ), + config, ) + # use character set from config + if config.characters is not None: + return IPAPhonemes(**config.characters), config + # return default character set + characters = IPAPhonemes() + new_config = replace(config, characters=characters.to_config()) + return characters, new_config class Graphemes(BaseCharacters): @@ -339,24 +378,42 @@ class Graphemes(BaseCharacters): eos: str = _eos, bos: str = _bos, blank: str = _blank, - is_unique: bool = True, + is_unique: bool = False, is_sorted: bool = True, ) -> None: super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted) @staticmethod def init_from_config(config: "Coqpit"): - return Graphemes( - **config.characters if config.characters is not None else {}, - ) + """Init a Graphemes object from a model config + + If characters are not defined in the config, it will be set to the default characters and the config + will be updated. 
+ """ + if config.characters is not None: + # band-aid for compatibility with old models + if "phonemes" in config.characters: + return ( + Graphemes( + characters=config.characters["characters"], + punctuations=config.characters["punctuations"], + pad=config.characters["pad"], + eos=config.characters["eos"], + bos=config.characters["bos"], + blank=config.characters["blank"], + is_unique=config.characters["is_unique"], + is_sorted=config.characters["is_sorted"], + ), + config, + ) + return Graphemes(**config.characters), config + characters = Graphemes() + new_config = replace(config, characters=characters.to_config()) + return characters, new_config if __name__ == "__main__": gr = Graphemes() ph = IPAPhonemes() - - print(gr.vocab) - print(ph.vocab) - - print(gr.num_chars) - assert "a" == gr.id_to_char(gr.char_to_id("a")) + gr.print_log() + ph.print_log() diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 249c8bce..08fa8e13 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc -import itertools -from typing import List, Tuple, Union +from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -8,6 +7,19 @@ from TTS.tts.utils.text.punctuation import Punctuation class BasePhonemizer(abc.ABC): """Base phonemizer class + Phonemization follows the following steps: + 1. Preprocessing: + - remove empty lines + - remove punctuation + - keep track of punctuation marks + + 2. Phonemization: + - convert text to phonemes + + 3. Postprocessing: + - join phonemes + - restore punctuation marks + Args: language (str): Language used by the phonemizer. @@ -51,40 +63,30 @@ class BasePhonemizer(abc.ABC): @abc.abstractmethod def name(): """The name of the backend""" + ... @classmethod @abc.abstractmethod def is_available(cls): """Returns True if the backend is installed, False otherwise""" + ... 
@classmethod @abc.abstractmethod def version(cls): """Return the backend version as a tuple (major, minor, patch)""" + ... + @staticmethod @abc.abstractmethod def supported_languages(): """Return a dict of language codes -> name supported by the backend""" + ... def is_supported_language(self, language): """Returns True if `language` is supported by the backend""" return language in self.supported_languages() - fr""" - Phonemization follows the following steps: - 1. Preprocessing: - - remove empty lines - - remove punctuation - - keep track of punctuation marks - - 2. Phonemization: - - convert text to phonemes - - 3. Postprocessing: - - join phonemes - - restore punctuation marks - """ - @abc.abstractmethod def _phonemize(self, text, separator): """The main phonemization method""" diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index f1d0b6cd..3cccee41 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -28,29 +28,30 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]: "1", # UTF8 text encoding ] cmd.extend(args) - logging.debug("espeakng: executing %s" % repr(cmd)) - p = subprocess.Popen( + logging.debug("espeakng: executing %s", repr(cmd)) + + with subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - ) - res = iter(p.stdout.readline, b"") - if not sync: + ) as p: + res = iter(p.stdout.readline, b"") + if not sync: + p.stdout.close() + if p.stderr: + p.stderr.close() + if p.stdin: + p.stdin.close() + return res + res2 = [] + for line in res: + res2.append(line) p.stdout.close() if p.stderr: p.stderr.close() if p.stdin: p.stdin.close() - return res - res2 = [] - for line in res: - res2.append(line) - p.stdout.close() - if p.stderr: - p.stderr.close() - if p.stdin: - p.stdin.close() - p.wait() + p.wait() return res2 @@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer): def __init__(self, language: 
str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True): if self._ESPEAK_LIB is None: raise Exception("Unknown backend: %s" % backend) + + # band-aid for backwards compatibility + if language == "en": + language = "en-us" + super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) + if backend is not None: + self.backend = backend + + @property + def backend(self): + return self._ESPEAK_LIB + + @backend.setter + def backend(self, backend): + if backend not in ["espeak", "espeak-ng"]: + raise Exception("Unknown backend: %s" % backend) + self._ESPEAK_LIB = backend def auto_set_espeak_lib(self) -> None: if is_tool("espeak-ng"): @@ -115,24 +133,25 @@ class ESpeak(BasePhonemizer): # espeak and espeak-ng parses `ipa` differently if tie: # use '͡' between phonemes - if _DEF_ESPEAK_LIB == "espeak": + if self.backend == "espeak": args.append("--ipa=1") else: args.append("--ipa=3") else: # split with '_' - if _DEF_ESPEAK_LIB == "espeak": + if self.backend == "espeak": args.append("--ipa=3") else: args.append("--ipa=1") if tie: args.append("--tie=%s" % tie) + args.append('"' + text + '"') # compute phonemes phonemes = "" for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): - logging.debug("line: %s" % repr(line)) - phonemes += line.decode("utf8").strip() + logging.debug("line: %s", repr(line)) + phonemes += line.decode("utf8").strip()[2:] # skip two redundant characters return phonemes.replace("_", separator) def _phonemize(self, text, separator=None): @@ -146,7 +165,7 @@ class ESpeak(BasePhonemizer): Dict: Dictionary of language codes. 
""" if _DEF_ESPEAK_LIB is None: - raise {} + return {} args = ["--voices"] langs = {} count = 0 @@ -157,7 +176,7 @@ class ESpeak(BasePhonemizer): lang_code = cols[1] lang_name = cols[3] langs[lang_code] = lang_name - logging.debug("line: %s" % repr(line)) + logging.debug("line: %s", repr(line)) count += 1 return langs @@ -168,9 +187,9 @@ class ESpeak(BasePhonemizer): str: Version of the used backend. """ args = ["--version"] - for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True): + for line in _espeak_exe(self.backend, args, sync=True): version = line.decode("utf8").strip().split()[2] - logging.debug("line: %s" % repr(line)) + logging.debug("line: %s", repr(line)) return version @classmethod diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index d0aa469e..f3e9c9ab 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from os import stat from typing import List import gruut @@ -55,7 +54,7 @@ class Gruut(BasePhonemizer): def name(): return "gruut" - def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: + def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument """Convert input text to phonemes. 
Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 4f93edeb..60b965f9 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer): language = "ja-jp" - def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): + def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod @@ -61,12 +61,12 @@ class JA_JP_Phonemizer(BasePhonemizer): return True -if __name__ == "__main__": - text = "これは、電話をかけるための私の日本語の例のテキストです。" - e = JA_JP_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "これは、電話をかけるための私の日本語の例のテキストです。" +# e = JA_JP_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index e8b2ce34..e36b0a2a 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -17,7 +17,7 @@ class MultiPhonemizer: lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER language = "multi-lingual" - def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: + def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) self.lang_to_phonemizer = 
self.init_phonemizers(self.lang_to_phonemizer_name) @@ -40,16 +40,16 @@ class MultiPhonemizer: return list(self.lang_to_phonemizer_name.keys()) -if __name__ == "__main__": - texts = { - "tr": "Merhaba, bu Türkçe bit örnek!", - "en-us": "Hello, this is English example!", - "de": "Hallo, das ist ein Deutches Beipiel!", - "zh-cn": "这是中国的例子", - } - phonemes = {} - ph = MultiPhonemizer() - for lang, text in texts.items(): - phoneme = ph.phonemize(text, lang) - phonemes[lang] = phoneme - print(phonemes) +# if __name__ == "__main__": +# texts = { +# "tr": "Merhaba, bu Türkçe bit örnek!", +# "en-us": "Hello, this is English example!", +# "de": "Hallo, das ist ein Deutches Beipiel!", +# "zh-cn": "这是中国的例子", +# } +# phonemes = {} +# ph = MultiPhonemizer() +# for lang, text in texts.items(): +# phoneme = ph.phonemize(text, lang) +# phonemes[lang] = phoneme +# print(phonemes) diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index e1bd77c7..5a4a5591 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer): language = "zh-cn" - def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): + def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod def name(): return "zh_cn_phonemizer" - def phonemize_zh_cn(self, text: str, separator: str = "|") -> str: + @staticmethod + def phonemize_zh_cn(text: str, separator: str = "|") -> str: ph = chinese_text_to_phonemes(text, separator) return ph @@ -50,12 +51,12 @@ class ZH_CN_Phonemizer(BasePhonemizer): return True -if __name__ == "__main__": - text = "这是,样本中文。" - e = ZH_CN_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - 
print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "这是,样本中文。" +# e = ZH_CN_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py index 414ac253..09087d5f 100644 --- a/TTS/tts/utils/text/punctuation.py +++ b/TTS/tts/utils/text/punctuation.py @@ -130,7 +130,7 @@ class Punctuation: return cls._restore(text, puncs, 0) @classmethod - def _restore(cls, text, puncs, num): + def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements """Auxiliary method for Punctuation.restore()""" if not puncs: return text @@ -159,14 +159,14 @@ class Punctuation: return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) -if __name__ == "__main__": - punc = Punctuation() - text = "This is. This is, example!" +# if __name__ == "__main__": +# punc = Punctuation() +# text = "This is. This is, example!" 
- print(punc.strip(text)) +# print(punc.strip(text)) - split_text, puncs = punc.strip_to_restore(text) - print(split_text, " ---- ", puncs) +# split_text, puncs = punc.strip_to_restore(text) +# print(split_text, " ---- ", puncs) - restored_text = punc.restore(split_text, puncs) - print(restored_text) +# restored_text = punc.restore(split_text, puncs) +# print(restored_text) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index bdee8615..bfa0e5e1 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -383,8 +383,7 @@ class AudioProcessor(object): def init_from_config(config: "Coqpit"): if "audio" in config: return AudioProcessor(**config.audio) - else: - return AudioProcessor(**config) + return AudioProcessor(**config) ### setting up the parameters ### def _build_mel_basis( diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2e4f4735..f6a1ae6a 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -13,7 +13,6 @@ from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import from TTS.tts.utils.synthesis import synthesis, trim_silence -from TTS.tts.utils.text import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index e56d1db4..f78d69b8 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -314,7 +314,7 @@ class GAN(BaseVocoder): data_items: List, verbose: bool, num_gpus: int, - rank: int = 0, # pylint: disable=unused-argument + rank: int = None, # pylint: disable=unused-argument ): """Initiate and return the GAN dataloader.