Make lint

This commit is contained in:
Eren Gölge 2021-12-07 12:51:58 +00:00
parent 30cfafce56
commit c9972e6f14
14 changed files with 319 additions and 236 deletions

View File

@ -111,8 +111,8 @@ def load_tts_samples(
meta_data_eval_all += meta_data_eval meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training # load attention masks for the duration predictor training
if dataset.meta_file_attn_mask: if d.meta_file_attn_mask:
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"]))
for idx, ins in enumerate(meta_data_train_all): for idx, ins in enumerate(meta_data_train_all):
attn_file = meta_data[ins["audio_file"]].strip() attn_file = meta_data[ins["audio_file"]].strip()
meta_data_train_all[idx].update({"alignment_file": attn_file}) meta_data_train_all[idx].update({"alignment_file": attn_file})

View File

@ -1,7 +1,6 @@
import collections import collections
import os import os
import random import random
from multiprocessing import Pool
from typing import Dict, List, Union from typing import Dict, List, Union
import numpy as np import numpy as np
@ -10,7 +9,6 @@ import tqdm
from torch.utils.data import Dataset from torch.utils.data import Dataset
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.tts.utils.text import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
@ -183,7 +181,7 @@ class TTSDataset(Dataset):
def get_phonemes(self, idx, text): def get_phonemes(self, idx, text):
out_dict = self.phoneme_dataset[idx] out_dict = self.phoneme_dataset[idx]
assert text == out_dict["text"], f"{text} != {out_dict['text']}" assert text == out_dict["text"], f"{text} != {out_dict['text']}"
assert out_dict["token_ids"].size > 0 assert len(out_dict["token_ids"]) > 0
return out_dict return out_dict
def get_f0(self, idx): def get_f0(self, idx):
@ -192,7 +190,8 @@ class TTSDataset(Dataset):
assert wav_file == out_dict["audio_file"] assert wav_file == out_dict["audio_file"]
return out_dict return out_dict
def get_attn_maks(self, attn_file): @staticmethod
def get_attn_mask(attn_file):
return np.load(attn_file) return np.load(attn_file)
def get_token_ids(self, idx, text): def get_token_ids(self, idx, text):
@ -207,7 +206,7 @@ class TTSDataset(Dataset):
raw_text = item["text"] raw_text = item["text"]
wav = np.asarray(self.load_wav(item[]), dtype=np.float32) wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)
# apply noise for augmentation # apply noise for augmentation
if self.use_noise_augment: if self.use_noise_augment:
@ -262,7 +261,7 @@ class TTSDataset(Dataset):
idxs = np.argsort(lengths) # ascending order idxs = np.argsort(lengths) # ascending order
ignore_idx = [] ignore_idx = []
keep_idx = [] keep_idx = []
for i, idx in enumerate(idxs): for idx in idxs:
length = lengths[idx] length = lengths[idx]
if length < min_len or length > max_len: if length < min_len or length > max_len:
ignore_idx.append(idx) ignore_idx.append(idx)
@ -277,6 +276,7 @@ class TTSDataset(Dataset):
@staticmethod @staticmethod
def create_buckets(samples, batch_group_size: int): def create_buckets(samples, batch_group_size: int):
assert batch_group_size > 0
for i in range(len(samples) // batch_group_size): for i in range(len(samples) // batch_group_size):
offset = i * batch_group_size offset = i * batch_group_size
end_offset = offset + batch_group_size end_offset = offset + batch_group_size
@ -319,6 +319,7 @@ class TTSDataset(Dataset):
# shuffle batch groups # shuffle batch groups
# create batches with similar length items # create batches with similar length items
# the larger the `batch_group_size`, the higher the length variety in a batch. # the larger the `batch_group_size`, the higher the length variety in a batch.
if self.batch_group_size > 0:
samples = self.create_buckets(samples, self.batch_group_size) samples = self.create_buckets(samples, self.batch_group_size)
# update items to the new sorted items # update items to the new sorted items
@ -571,6 +572,7 @@ class PhonemeDataset(Dataset):
We use pytorch dataloader because we are lazy. We use pytorch dataloader because we are lazy.
""" """
print("[*] Pre-computing phonemes...")
with tqdm.tqdm(total=len(self)) as pbar: with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1 batch_size = num_workers if num_workers > 0 else 1
dataloder = torch.utils.data.DataLoader( dataloder = torch.utils.data.DataLoader(
@ -658,16 +660,21 @@ class F0Dataset:
return len(self.samples) return len(self.samples)
def precompute(self, num_workers=0): def precompute(self, num_workers=0):
print("[*] Pre-computing F0s...")
with tqdm.tqdm(total=len(self)) as pbar: with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1 batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preprocessing
normalize_f0 = self.normalize_f0
self.normalize_f0 = False
dataloder = torch.utils.data.DataLoader( dataloder = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
) )
computed_data = [] computed_data = []
for batch in dataloder: for batch in dataloder:
f0 = batch["f0"] f0 = batch["f0"]
computed_data.append([f for f in f0]) computed_data.append(f for f in f0)
pbar.update(batch_size) pbar.update(batch_size)
self.normalize_f0 = normalize_f0
if self.normalize_f0: if self.normalize_f0:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
@ -746,80 +753,80 @@ class F0Dataset:
print(f"{indent}| > Number of instances : {len(self.samples)}") print(f"{indent}| > Number of instances : {len(self.samples)}")
if __name__ == "__main__": # if __name__ == "__main__":
from torch.utils.data import DataLoader # from torch.utils.data import DataLoader
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig # from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples # from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.characters import IPAPhonemes # from TTS.tts.utils.text.characters import IPAPhonemes
from TTS.tts.utils.text.phonemizers import ESpeak # from TTS.tts.utils.text.phonemizers import ESpeak
dataset_config = BaseDatasetConfig( # dataset_config = BaseDatasetConfig(
name="ljspeech", # name="ljspeech",
meta_file_train="metadata.csv", # meta_file_train="metadata.csv",
path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1", # path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
) # )
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
samples = train_samples + eval_samples # samples = train_samples + eval_samples
phonemizer = ESpeak(language="en-us") # phonemizer = ESpeak(language="en-us")
tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer) # tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
# ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests") # # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
# ph_dataset.precompute(num_workers=4) # # ph_dataset.precompute(num_workers=4)
# dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn) # # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
# # for batch in dataloader:
# # print(batch)
# # break
# audio_config = BaseAudioConfig(
# sample_rate=22050,
# win_length=1024,
# hop_length=256,
# num_mels=80,
# preemphasis=0.0,
# ref_level_db=20,
# log_func="np.log",
# do_trim_silence=True,
# trim_db=45,
# mel_fmin=0,
# mel_fmax=8000,
# spec_gain=1.0,
# signal_norm=False,
# do_amp_to_db_linear=False,
# )
# ap = AudioProcessor.init_from_config(audio_config)
# # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
# # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
# # for batch in dataloader:
# # print(batch)
# # breakpoint()
# # break
# dataset = TTSDataset(
# outputs_per_step=1,
# compute_linear_spec=False,
# samples=samples,
# ap=ap,
# return_wav=False,
# batch_group_size=0,
# min_seq_len=0,
# max_seq_len=500,
# use_noise_augment=False,
# verbose=True,
# speaker_id_mapping=None,
# d_vector_mapping=None,
# compute_f0=True,
# f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
# tokenizer=tokenizer,
# phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
# precompute_num_workers=4,
# )
# dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
# for batch in dataloader: # for batch in dataloader:
# print(batch) # print(batch)
# break # break
audio_config = BaseAudioConfig(
sample_rate=22050,
win_length=1024,
hop_length=256,
num_mels=80,
preemphasis=0.0,
ref_level_db=20,
log_func="np.log",
do_trim_silence=True,
trim_db=45,
mel_fmin=0,
mel_fmax=8000,
spec_gain=1.0,
signal_norm=False,
do_amp_to_db_linear=False,
)
ap = AudioProcessor.init_from_config(audio_config)
# f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
# dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
# for batch in dataloader:
# print(batch)
# breakpoint()
# break
dataset = TTSDataset(
outputs_per_step=1,
compute_linear_spec=False,
samples=samples,
ap=ap,
return_wav=False,
batch_group_size=0,
min_seq_len=0,
max_seq_len=500,
use_noise_augment=False,
verbose=True,
speaker_id_mapping=None,
d_vector_mapping=None,
compute_f0=True,
f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
tokenizer=tokenizer,
phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
precompute_num_workers=4,
)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
for batch in dataloader:
print(batch)
break

View File

@ -199,10 +199,10 @@ def synthesis(
wav = model_outputs.squeeze(0) wav = model_outputs.squeeze(0)
else: else:
if use_griffin_lim: if use_griffin_lim:
wav = inv_spectrogram(model_outputs, ap, CONFIG) wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence # trim silence
if do_trim_silence: if do_trim_silence:
wav = trim_silence(wav, ap) wav = trim_silence(wav, model.ap)
return_dict = { return_dict = {
"wav": wav, "wav": wav,
"alignments": alignments, "alignments": alignments,

View File

@ -1,3 +1,8 @@
from dataclasses import replace
from TTS.tts.configs.shared_configs import CharactersConfig
def parse_symbols(): def parse_symbols():
return { return {
"pad": _pad, "pad": _pad,
@ -29,46 +34,49 @@ _diacrilics = "ɚ˞ɫ"
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
def create_graphemes( # def create_graphemes(
characters=_characters, # characters=_characters,
punctuations=_punctuations, # punctuations=_punctuations,
pad=_pad, # pad=_pad,
eos=_eos, # eos=_eos,
bos=_bos, # bos=_bos,
blank=_blank, # blank=_blank,
unique=True, # unique=True,
): # pylint: disable=redefined-outer-name # ): # pylint: disable=redefined-outer-name
"""Function to create default characters and phonemes""" # """Function to create default characters and phonemes"""
# create graphemes # # create graphemes
_graphemes = list(characters) # = (
_graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes # sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
_graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes # ) # this is to keep previous models compatible.
_graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes # _graphemes = list(characters)
_graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes # _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
_graphemes = _graphemes + list(punctuations) # _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
return _graphemes, _phonemes # _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
# _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
# _graphemes = _graphemes + list(punctuations)
# return _graphemes, _phonemes
def create_phonemes( # def create_phonemes(
phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True # phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
): # ):
# create phonemes # # create phonemes
_phonemes = None # _phonemes = None
_phonemes_sorted = ( # _phonemes_sorted = (
sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) # sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
) # this is to keep previous models compatible. # ) # this is to keep previous models compatible.
_phonemes = list(_phonemes_sorted) # _phonemes = list(_phonemes_sorted)
_phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes # _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
_phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes # _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
_phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes # _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
_phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes # _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
_phonemes = _phonemes + list(punctuations) # _phonemes = _phonemes + list(punctuations)
_phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) # _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
return _phonemes # return _phonemes
graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos) # DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank) # DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
class BaseCharacters: class BaseCharacters:
@ -114,7 +122,7 @@ class BaseCharacters:
eos: str, eos: str,
bos: str, bos: str,
blank: str, blank: str,
is_unique: bool = True, is_unique: bool = False,
is_sorted: bool = True, is_sorted: bool = True,
) -> None: ) -> None:
self._characters = characters self._characters = characters
@ -202,14 +210,20 @@ class BaseCharacters:
_vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
self._vocab = _vocab + list(self._punctuations) self._vocab = _vocab + list(self._punctuations)
self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} self._id_to_char = {
idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension
}
if self.is_unique: if self.is_unique:
duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
assert ( assert (
len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}" ), f" [!] There are duplicate characters in the character set. {duplicates}"
def char_to_id(self, char: str) -> int: def char_to_id(self, char: str) -> int:
try:
return self._char_to_id[char] return self._char_to_id[char]
except KeyError as e:
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
def id_to_char(self, idx: int) -> str: def id_to_char(self, idx: int) -> str:
return self._id_to_char[idx] return self._id_to_char[idx]
@ -229,9 +243,23 @@ class BaseCharacters:
print(f"{indent}| > Num chars: {self.num_chars}") print(f"{indent}| > Num chars: {self.num_chars}")
@staticmethod @staticmethod
def init_from_config(config: "Coqpit"): def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument
return BaseCharacters( """Init your character class from a config.
**config.characters if config.characters is not None else {},
Implement this method for your subclass.
"""
...
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
characters=self._characters,
punctuations=self._punctuations,
pad=self._pad,
eos=self._eos,
bos=self._bos,
blank=self._blank,
is_unique=self.is_unique,
is_sorted=self.is_sorted,
) )
@ -275,18 +303,24 @@ class IPAPhonemes(BaseCharacters):
eos: str = _eos, eos: str = _eos,
bos: str = _bos, bos: str = _bos,
blank: str = _blank, blank: str = _blank,
is_unique: bool = True, is_unique: bool = False,
is_sorted: bool = True, is_sorted: bool = True,
) -> None: ) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted) super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod @staticmethod
def init_from_config(config: "Coqpit"): def init_from_config(config: "Coqpit"):
"""Init a IPAPhonemes object from a model config
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
# band-aid for compatibility with old models # band-aid for compatibility with old models
if "characters" in config and config.characters is not None: if "characters" in config and config.characters is not None:
if "phonemes" in config.characters and config.characters.phonemes is not None: if "phonemes" in config.characters and config.characters.phonemes is not None:
config.characters["characters"] = config.characters["phonemes"] config.characters["characters"] = config.characters["phonemes"]
return IPAPhonemes( return (
IPAPhonemes(
characters=config.characters["characters"], characters=config.characters["characters"],
punctuations=config.characters["punctuations"], punctuations=config.characters["punctuations"],
pad=config.characters["pad"], pad=config.characters["pad"],
@ -295,11 +329,16 @@ class IPAPhonemes(BaseCharacters):
blank=config.characters["blank"], blank=config.characters["blank"],
is_unique=config.characters["is_unique"], is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"], is_sorted=config.characters["is_sorted"],
),
config,
) )
else: # use character set from config
return IPAPhonemes( if config.characters is not None:
**config.characters if config.characters is not None else {}, return IPAPhonemes(**config.characters), config
) # return default character set
characters = IPAPhonemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
class Graphemes(BaseCharacters): class Graphemes(BaseCharacters):
@ -339,24 +378,42 @@ class Graphemes(BaseCharacters):
eos: str = _eos, eos: str = _eos,
bos: str = _bos, bos: str = _bos,
blank: str = _blank, blank: str = _blank,
is_unique: bool = True, is_unique: bool = False,
is_sorted: bool = True, is_sorted: bool = True,
) -> None: ) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted) super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod @staticmethod
def init_from_config(config: "Coqpit"): def init_from_config(config: "Coqpit"):
return Graphemes( """Init a Graphemes object from a model config
**config.characters if config.characters is not None else {},
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
if config.characters is not None:
# band-aid for compatibility with old models
if "phonemes" in config.characters:
return (
Graphemes(
characters=config.characters["characters"],
punctuations=config.characters["punctuations"],
pad=config.characters["pad"],
eos=config.characters["eos"],
bos=config.characters["bos"],
blank=config.characters["blank"],
is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"],
),
config,
) )
return Graphemes(**config.characters), config
characters = Graphemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
if __name__ == "__main__": if __name__ == "__main__":
gr = Graphemes() gr = Graphemes()
ph = IPAPhonemes() ph = IPAPhonemes()
gr.print_log()
print(gr.vocab) ph.print_log()
print(ph.vocab)
print(gr.num_chars)
assert "a" == gr.id_to_char(gr.char_to_id("a"))

View File

@ -1,6 +1,5 @@
import abc import abc
import itertools from typing import List, Tuple
from typing import List, Tuple, Union
from TTS.tts.utils.text.punctuation import Punctuation from TTS.tts.utils.text.punctuation import Punctuation
@ -8,6 +7,19 @@ from TTS.tts.utils.text.punctuation import Punctuation
class BasePhonemizer(abc.ABC): class BasePhonemizer(abc.ABC):
"""Base phonemizer class """Base phonemizer class
Phonemization follows the following steps:
1. Preprocessing:
- remove empty lines
- remove punctuation
- keep track of punctuation marks
2. Phonemization:
- convert text to phonemes
3. Postprocessing:
- join phonemes
- restore punctuation marks
Args: Args:
language (str): language (str):
Language used by the phonemizer. Language used by the phonemizer.
@ -51,40 +63,30 @@ class BasePhonemizer(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def name(): def name():
"""The name of the backend""" """The name of the backend"""
...
@classmethod @classmethod
@abc.abstractmethod @abc.abstractmethod
def is_available(cls): def is_available(cls):
"""Returns True if the backend is installed, False otherwise""" """Returns True if the backend is installed, False otherwise"""
...
@classmethod @classmethod
@abc.abstractmethod @abc.abstractmethod
def version(cls): def version(cls):
"""Return the backend version as a tuple (major, minor, patch)""" """Return the backend version as a tuple (major, minor, patch)"""
...
@staticmethod
@abc.abstractmethod @abc.abstractmethod
def supported_languages(): def supported_languages():
"""Return a dict of language codes -> name supported by the backend""" """Return a dict of language codes -> name supported by the backend"""
...
def is_supported_language(self, language): def is_supported_language(self, language):
"""Returns True if `language` is supported by the backend""" """Returns True if `language` is supported by the backend"""
return language in self.supported_languages() return language in self.supported_languages()
fr"""
Phonemization follows the following steps:
1. Preprocessing:
- remove empty lines
- remove punctuation
- keep track of punctuation marks
2. Phonemization:
- convert text to phonemes
3. Postprocessing:
- join phonemes
- restore punctuation marks
"""
@abc.abstractmethod @abc.abstractmethod
def _phonemize(self, text, separator): def _phonemize(self, text, separator):
"""The main phonemization method""" """The main phonemization method"""

View File

@ -28,12 +28,13 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
"1", # UTF8 text encoding "1", # UTF8 text encoding
] ]
cmd.extend(args) cmd.extend(args)
logging.debug("espeakng: executing %s" % repr(cmd)) logging.debug("espeakng: executing %s", repr(cmd))
p = subprocess.Popen(
with subprocess.Popen(
cmd, cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
) ) as p:
res = iter(p.stdout.readline, b"") res = iter(p.stdout.readline, b"")
if not sync: if not sync:
p.stdout.close() p.stdout.close()
@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer):
def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True): def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
if self._ESPEAK_LIB is None: if self._ESPEAK_LIB is None:
raise Exception("Unknown backend: %s" % backend) raise Exception("Unknown backend: %s" % backend)
# band-aid for backwards compatibility
if language == "en":
language = "en-us"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None:
self.backend = backend
@property
def backend(self):
return self._ESPEAK_LIB
@backend.setter
def backend(self, backend):
if backend not in ["espeak", "espeak-ng"]:
raise Exception("Unknown backend: %s" % backend)
self._ESPEAK_LIB = backend
def auto_set_espeak_lib(self) -> None: def auto_set_espeak_lib(self) -> None:
if is_tool("espeak-ng"): if is_tool("espeak-ng"):
@ -115,24 +133,25 @@ class ESpeak(BasePhonemizer):
# espeak and espeak-ng parses `ipa` differently # espeak and espeak-ng parses `ipa` differently
if tie: if tie:
# use '͡' between phonemes # use '͡' between phonemes
if _DEF_ESPEAK_LIB == "espeak": if self.backend == "espeak":
args.append("--ipa=1") args.append("--ipa=1")
else: else:
args.append("--ipa=3") args.append("--ipa=3")
else: else:
# split with '_' # split with '_'
if _DEF_ESPEAK_LIB == "espeak": if self.backend == "espeak":
args.append("--ipa=3") args.append("--ipa=3")
else: else:
args.append("--ipa=1") args.append("--ipa=1")
if tie: if tie:
args.append("--tie=%s" % tie) args.append("--tie=%s" % tie)
args.append('"' + text + '"') args.append('"' + text + '"')
# compute phonemes # compute phonemes
phonemes = "" phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s" % repr(line)) logging.debug("line: %s", repr(line))
phonemes += line.decode("utf8").strip() phonemes += line.decode("utf8").strip()[2:] # skip two redundant characters
return phonemes.replace("_", separator) return phonemes.replace("_", separator)
def _phonemize(self, text, separator=None): def _phonemize(self, text, separator=None):
@ -146,7 +165,7 @@ class ESpeak(BasePhonemizer):
Dict: Dictionary of language codes. Dict: Dictionary of language codes.
""" """
if _DEF_ESPEAK_LIB is None: if _DEF_ESPEAK_LIB is None:
raise {} return {}
args = ["--voices"] args = ["--voices"]
langs = {} langs = {}
count = 0 count = 0
@ -157,7 +176,7 @@ class ESpeak(BasePhonemizer):
lang_code = cols[1] lang_code = cols[1]
lang_name = cols[3] lang_name = cols[3]
langs[lang_code] = lang_name langs[lang_code] = lang_name
logging.debug("line: %s" % repr(line)) logging.debug("line: %s", repr(line))
count += 1 count += 1
return langs return langs
@ -168,9 +187,9 @@ class ESpeak(BasePhonemizer):
str: Version of the used backend. str: Version of the used backend.
""" """
args = ["--version"] args = ["--version"]
for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True): for line in _espeak_exe(self.backend, args, sync=True):
version = line.decode("utf8").strip().split()[2] version = line.decode("utf8").strip().split()[2]
logging.debug("line: %s" % repr(line)) logging.debug("line: %s", repr(line))
return version return version
@classmethod @classmethod

View File

@ -1,5 +1,4 @@
import importlib import importlib
from os import stat
from typing import List from typing import List
import gruut import gruut
@ -55,7 +54,7 @@ class Gruut(BasePhonemizer):
def name(): def name():
return "gruut" return "gruut"
def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
"""Convert input text to phonemes. """Convert input text to phonemes.
Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters

View File

@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer):
language = "ja-jp" language = "ja-jp"
def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod @staticmethod
@ -61,12 +61,12 @@ class JA_JP_Phonemizer(BasePhonemizer):
return True return True
if __name__ == "__main__": # if __name__ == "__main__":
text = "これは、電話をかけるための私の日本語の例のテキストです。" # text = "これは、電話をかけるための私の日本語の例のテキストです。"
e = JA_JP_Phonemizer() # e = JA_JP_Phonemizer()
print(e.supported_languages()) # print(e.supported_languages())
print(e.version()) # print(e.version())
print(e.language) # print(e.language)
print(e.name()) # print(e.name())
print(e.is_available()) # print(e.is_available())
print("`" + e.phonemize(text) + "`") # print("`" + e.phonemize(text) + "`")

View File

@ -17,7 +17,7 @@ class MultiPhonemizer:
lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER
language = "multi-lingual" language = "multi-lingual"
def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value
self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer)
self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
@ -40,16 +40,16 @@ class MultiPhonemizer:
return list(self.lang_to_phonemizer_name.keys()) return list(self.lang_to_phonemizer_name.keys())
if __name__ == "__main__": # if __name__ == "__main__":
texts = { # texts = {
"tr": "Merhaba, bu Türkçe bit örnek!", # "tr": "Merhaba, bu Türkçe bit örnek!",
"en-us": "Hello, this is English example!", # "en-us": "Hello, this is English example!",
"de": "Hallo, das ist ein Deutches Beipiel!", # "de": "Hallo, das ist ein Deutches Beipiel!",
"zh-cn": "这是中国的例子", # "zh-cn": "这是中国的例子",
} # }
phonemes = {} # phonemes = {}
ph = MultiPhonemizer() # ph = MultiPhonemizer()
for lang, text in texts.items(): # for lang, text in texts.items():
phoneme = ph.phonemize(text, lang) # phoneme = ph.phonemize(text, lang)
phonemes[lang] = phoneme # phonemes[lang] = phoneme
print(phonemes) # print(phonemes)

View File

@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer):
language = "zh-cn" language = "zh-cn"
def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod @staticmethod
def name(): def name():
return "zh_cn_phonemizer" return "zh_cn_phonemizer"
def phonemize_zh_cn(self, text: str, separator: str = "|") -> str: @staticmethod
def phonemize_zh_cn(text: str, separator: str = "|") -> str:
ph = chinese_text_to_phonemes(text, separator) ph = chinese_text_to_phonemes(text, separator)
return ph return ph
@ -50,12 +51,12 @@ class ZH_CN_Phonemizer(BasePhonemizer):
return True return True
if __name__ == "__main__": # if __name__ == "__main__":
text = "这是,样本中文。" # text = "这是,样本中文。"
e = ZH_CN_Phonemizer() # e = ZH_CN_Phonemizer()
print(e.supported_languages()) # print(e.supported_languages())
print(e.version()) # print(e.version())
print(e.language) # print(e.language)
print(e.name()) # print(e.name())
print(e.is_available()) # print(e.is_available())
print("`" + e.phonemize(text) + "`") # print("`" + e.phonemize(text) + "`")

View File

@ -130,7 +130,7 @@ class Punctuation:
return cls._restore(text, puncs, 0) return cls._restore(text, puncs, 0)
@classmethod @classmethod
def _restore(cls, text, puncs, num): def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
"""Auxiliary method for Punctuation.restore()""" """Auxiliary method for Punctuation.restore()"""
if not puncs: if not puncs:
return text return text
@ -159,14 +159,14 @@ class Punctuation:
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
if __name__ == "__main__": # if __name__ == "__main__":
punc = Punctuation() # punc = Punctuation()
text = "This is. This is, example!" # text = "This is. This is, example!"
print(punc.strip(text)) # print(punc.strip(text))
split_text, puncs = punc.strip_to_restore(text) # split_text, puncs = punc.strip_to_restore(text)
print(split_text, " ---- ", puncs) # print(split_text, " ---- ", puncs)
restored_text = punc.restore(split_text, puncs) # restored_text = punc.restore(split_text, puncs)
print(restored_text) # print(restored_text)

View File

@ -383,7 +383,6 @@ class AudioProcessor(object):
def init_from_config(config: "Coqpit"): def init_from_config(config: "Coqpit"):
if "audio" in config: if "audio" in config:
return AudioProcessor(**config.audio) return AudioProcessor(**config.audio)
else:
return AudioProcessor(**config) return AudioProcessor(**config)
### setting up the parameters ### ### setting up the parameters ###

View File

@ -13,7 +13,6 @@ from TTS.tts.utils.speakers import SpeakerManager
# pylint: disable=unused-wildcard-import # pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import # pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import synthesis, trim_silence from TTS.tts.utils.synthesis import synthesis, trim_silence
from TTS.tts.utils.text import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.models import setup_model as setup_vocoder_model
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

View File

@ -314,7 +314,7 @@ class GAN(BaseVocoder):
data_items: List, data_items: List,
verbose: bool, verbose: bool,
num_gpus: int, num_gpus: int,
rank: int = 0, # pylint: disable=unused-argument rank: int = None, # pylint: disable=unused-argument
): ):
"""Initiate and return the GAN dataloader. """Initiate and return the GAN dataloader.