mirror of https://github.com/coqui-ai/TTS.git
Make lint
This commit is contained in:
parent
30cfafce56
commit
c9972e6f14
|
@ -111,8 +111,8 @@ def load_tts_samples(
|
|||
meta_data_eval_all += meta_data_eval
|
||||
meta_data_train_all += meta_data_train
|
||||
# load attention masks for the duration predictor training
|
||||
if dataset.meta_file_attn_mask:
|
||||
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
|
||||
if d.meta_file_attn_mask:
|
||||
meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"]))
|
||||
for idx, ins in enumerate(meta_data_train_all):
|
||||
attn_file = meta_data[ins["audio_file"]].strip()
|
||||
meta_data_train_all[idx].update({"alignment_file": attn_file})
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import collections
|
||||
import os
|
||||
import random
|
||||
from multiprocessing import Pool
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
|
@ -10,7 +9,6 @@ import tqdm
|
|||
from torch.utils.data import Dataset
|
||||
|
||||
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
|
||||
from TTS.tts.utils.text import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
|
@ -183,7 +181,7 @@ class TTSDataset(Dataset):
|
|||
def get_phonemes(self, idx, text):
|
||||
out_dict = self.phoneme_dataset[idx]
|
||||
assert text == out_dict["text"], f"{text} != {out_dict['text']}"
|
||||
assert out_dict["token_ids"].size > 0
|
||||
assert len(out_dict["token_ids"]) > 0
|
||||
return out_dict
|
||||
|
||||
def get_f0(self, idx):
|
||||
|
@ -192,7 +190,8 @@ class TTSDataset(Dataset):
|
|||
assert wav_file == out_dict["audio_file"]
|
||||
return out_dict
|
||||
|
||||
def get_attn_maks(self, attn_file):
|
||||
@staticmethod
|
||||
def get_attn_mask(attn_file):
|
||||
return np.load(attn_file)
|
||||
|
||||
def get_token_ids(self, idx, text):
|
||||
|
@ -207,7 +206,7 @@ class TTSDataset(Dataset):
|
|||
|
||||
raw_text = item["text"]
|
||||
|
||||
wav = np.asarray(self.load_wav(item[]), dtype=np.float32)
|
||||
wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)
|
||||
|
||||
# apply noise for augmentation
|
||||
if self.use_noise_augment:
|
||||
|
@ -262,7 +261,7 @@ class TTSDataset(Dataset):
|
|||
idxs = np.argsort(lengths) # ascending order
|
||||
ignore_idx = []
|
||||
keep_idx = []
|
||||
for i, idx in enumerate(idxs):
|
||||
for idx in idxs:
|
||||
length = lengths[idx]
|
||||
if length < min_len or length > max_len:
|
||||
ignore_idx.append(idx)
|
||||
|
@ -277,6 +276,7 @@ class TTSDataset(Dataset):
|
|||
|
||||
@staticmethod
|
||||
def create_buckets(samples, batch_group_size: int):
|
||||
assert batch_group_size > 0
|
||||
for i in range(len(samples) // batch_group_size):
|
||||
offset = i * batch_group_size
|
||||
end_offset = offset + batch_group_size
|
||||
|
@ -319,7 +319,8 @@ class TTSDataset(Dataset):
|
|||
# shuffle batch groups
|
||||
# create batches with similar length items
|
||||
# the larger the `batch_group_size`, the higher the length variety in a batch.
|
||||
samples = self.create_buckets(samples, self.batch_group_size)
|
||||
if self.batch_group_size > 0:
|
||||
samples = self.create_buckets(samples, self.batch_group_size)
|
||||
|
||||
# update items to the new sorted items
|
||||
self.samples = samples
|
||||
|
@ -571,6 +572,7 @@ class PhonemeDataset(Dataset):
|
|||
|
||||
We use pytorch dataloader because we are lazy.
|
||||
"""
|
||||
print("[*] Pre-computing phonemes...")
|
||||
with tqdm.tqdm(total=len(self)) as pbar:
|
||||
batch_size = num_workers if num_workers > 0 else 1
|
||||
dataloder = torch.utils.data.DataLoader(
|
||||
|
@ -658,16 +660,21 @@ class F0Dataset:
|
|||
return len(self.samples)
|
||||
|
||||
def precompute(self, num_workers=0):
|
||||
print("[*] Pre-computing F0s...")
|
||||
with tqdm.tqdm(total=len(self)) as pbar:
|
||||
batch_size = num_workers if num_workers > 0 else 1
|
||||
# we do not normalize at preprocessing
|
||||
normalize_f0 = self.normalize_f0
|
||||
self.normalize_f0 = False
|
||||
dataloder = torch.utils.data.DataLoader(
|
||||
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
|
||||
)
|
||||
computed_data = []
|
||||
for batch in dataloder:
|
||||
f0 = batch["f0"]
|
||||
computed_data.append([f for f in f0])
|
||||
computed_data.append(f for f in f0)
|
||||
pbar.update(batch_size)
|
||||
self.normalize_f0 = normalize_f0
|
||||
|
||||
if self.normalize_f0:
|
||||
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
|
||||
|
@ -746,80 +753,80 @@ class F0Dataset:
|
|||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch.utils.data import DataLoader
|
||||
# if __name__ == "__main__":
|
||||
# from torch.utils.data import DataLoader
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.text.characters import IPAPhonemes
|
||||
from TTS.tts.utils.text.phonemizers import ESpeak
|
||||
# from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
|
||||
# from TTS.tts.datasets import load_tts_samples
|
||||
# from TTS.tts.utils.text.characters import IPAPhonemes
|
||||
# from TTS.tts.utils.text.phonemizers import ESpeak
|
||||
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="ljspeech",
|
||||
meta_file_train="metadata.csv",
|
||||
path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
|
||||
)
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
samples = train_samples + eval_samples
|
||||
# dataset_config = BaseDatasetConfig(
|
||||
# name="ljspeech",
|
||||
# meta_file_train="metadata.csv",
|
||||
# path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
|
||||
# )
|
||||
# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
# samples = train_samples + eval_samples
|
||||
|
||||
phonemizer = ESpeak(language="en-us")
|
||||
tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
|
||||
# ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
|
||||
# ph_dataset.precompute(num_workers=4)
|
||||
# phonemizer = ESpeak(language="en-us")
|
||||
# tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
|
||||
# # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
|
||||
# # ph_dataset.precompute(num_workers=4)
|
||||
|
||||
# dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
|
||||
# for batch in dataloader:
|
||||
# print(batch)
|
||||
# break
|
||||
# # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
|
||||
# # for batch in dataloader:
|
||||
# # print(batch)
|
||||
# # break
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=22050,
|
||||
win_length=1024,
|
||||
hop_length=256,
|
||||
num_mels=80,
|
||||
preemphasis=0.0,
|
||||
ref_level_db=20,
|
||||
log_func="np.log",
|
||||
do_trim_silence=True,
|
||||
trim_db=45,
|
||||
mel_fmin=0,
|
||||
mel_fmax=8000,
|
||||
spec_gain=1.0,
|
||||
signal_norm=False,
|
||||
do_amp_to_db_linear=False,
|
||||
)
|
||||
# audio_config = BaseAudioConfig(
|
||||
# sample_rate=22050,
|
||||
# win_length=1024,
|
||||
# hop_length=256,
|
||||
# num_mels=80,
|
||||
# preemphasis=0.0,
|
||||
# ref_level_db=20,
|
||||
# log_func="np.log",
|
||||
# do_trim_silence=True,
|
||||
# trim_db=45,
|
||||
# mel_fmin=0,
|
||||
# mel_fmax=8000,
|
||||
# spec_gain=1.0,
|
||||
# signal_norm=False,
|
||||
# do_amp_to_db_linear=False,
|
||||
# )
|
||||
|
||||
ap = AudioProcessor.init_from_config(audio_config)
|
||||
# ap = AudioProcessor.init_from_config(audio_config)
|
||||
|
||||
# f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
|
||||
# # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
|
||||
|
||||
# dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
|
||||
# for batch in dataloader:
|
||||
# print(batch)
|
||||
# breakpoint()
|
||||
# break
|
||||
# # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
|
||||
# # for batch in dataloader:
|
||||
# # print(batch)
|
||||
# # breakpoint()
|
||||
# # break
|
||||
|
||||
dataset = TTSDataset(
|
||||
outputs_per_step=1,
|
||||
compute_linear_spec=False,
|
||||
samples=samples,
|
||||
ap=ap,
|
||||
return_wav=False,
|
||||
batch_group_size=0,
|
||||
min_seq_len=0,
|
||||
max_seq_len=500,
|
||||
use_noise_augment=False,
|
||||
verbose=True,
|
||||
speaker_id_mapping=None,
|
||||
d_vector_mapping=None,
|
||||
compute_f0=True,
|
||||
f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
|
||||
tokenizer=tokenizer,
|
||||
phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
|
||||
precompute_num_workers=4,
|
||||
)
|
||||
# dataset = TTSDataset(
|
||||
# outputs_per_step=1,
|
||||
# compute_linear_spec=False,
|
||||
# samples=samples,
|
||||
# ap=ap,
|
||||
# return_wav=False,
|
||||
# batch_group_size=0,
|
||||
# min_seq_len=0,
|
||||
# max_seq_len=500,
|
||||
# use_noise_augment=False,
|
||||
# verbose=True,
|
||||
# speaker_id_mapping=None,
|
||||
# d_vector_mapping=None,
|
||||
# compute_f0=True,
|
||||
# f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
|
||||
# tokenizer=tokenizer,
|
||||
# phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
|
||||
# precompute_num_workers=4,
|
||||
# )
|
||||
|
||||
dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
|
||||
for batch in dataloader:
|
||||
print(batch)
|
||||
break
|
||||
# dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
|
||||
# for batch in dataloader:
|
||||
# print(batch)
|
||||
# break
|
||||
|
|
|
@ -199,10 +199,10 @@ def synthesis(
|
|||
wav = model_outputs.squeeze(0)
|
||||
else:
|
||||
if use_griffin_lim:
|
||||
wav = inv_spectrogram(model_outputs, ap, CONFIG)
|
||||
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
|
||||
# trim silence
|
||||
if do_trim_silence:
|
||||
wav = trim_silence(wav, ap)
|
||||
wav = trim_silence(wav, model.ap)
|
||||
return_dict = {
|
||||
"wav": wav,
|
||||
"alignments": alignments,
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
from dataclasses import replace
|
||||
|
||||
from TTS.tts.configs.shared_configs import CharactersConfig
|
||||
|
||||
|
||||
def parse_symbols():
|
||||
return {
|
||||
"pad": _pad,
|
||||
|
@ -29,46 +34,49 @@ _diacrilics = "ɚ˞ɫ"
|
|||
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
|
||||
|
||||
|
||||
def create_graphemes(
|
||||
characters=_characters,
|
||||
punctuations=_punctuations,
|
||||
pad=_pad,
|
||||
eos=_eos,
|
||||
bos=_bos,
|
||||
blank=_blank,
|
||||
unique=True,
|
||||
): # pylint: disable=redefined-outer-name
|
||||
"""Function to create default characters and phonemes"""
|
||||
# create graphemes
|
||||
_graphemes = list(characters)
|
||||
_graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
|
||||
_graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
|
||||
_graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
|
||||
_graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
|
||||
_graphemes = _graphemes + list(punctuations)
|
||||
return _graphemes, _phonemes
|
||||
# def create_graphemes(
|
||||
# characters=_characters,
|
||||
# punctuations=_punctuations,
|
||||
# pad=_pad,
|
||||
# eos=_eos,
|
||||
# bos=_bos,
|
||||
# blank=_blank,
|
||||
# unique=True,
|
||||
# ): # pylint: disable=redefined-outer-name
|
||||
# """Function to create default characters and phonemes"""
|
||||
# # create graphemes
|
||||
# = (
|
||||
# sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
|
||||
# ) # this is to keep previous models compatible.
|
||||
# _graphemes = list(characters)
|
||||
# _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
|
||||
# _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
|
||||
# _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
|
||||
# _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
|
||||
# _graphemes = _graphemes + list(punctuations)
|
||||
# return _graphemes, _phonemes
|
||||
|
||||
|
||||
def create_phonemes(
|
||||
phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
|
||||
):
|
||||
# create phonemes
|
||||
_phonemes = None
|
||||
_phonemes_sorted = (
|
||||
sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
|
||||
) # this is to keep previous models compatible.
|
||||
_phonemes = list(_phonemes_sorted)
|
||||
_phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
|
||||
_phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
|
||||
_phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
|
||||
_phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
|
||||
_phonemes = _phonemes + list(punctuations)
|
||||
_phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
|
||||
return _phonemes
|
||||
# def create_phonemes(
|
||||
# phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
|
||||
# ):
|
||||
# # create phonemes
|
||||
# _phonemes = None
|
||||
# _phonemes_sorted = (
|
||||
# sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
|
||||
# ) # this is to keep previous models compatible.
|
||||
# _phonemes = list(_phonemes_sorted)
|
||||
# _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
|
||||
# _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
|
||||
# _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
|
||||
# _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
|
||||
# _phonemes = _phonemes + list(punctuations)
|
||||
# _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
|
||||
# return _phonemes
|
||||
|
||||
|
||||
graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
|
||||
phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
|
||||
# DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
|
||||
# DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
|
||||
|
||||
|
||||
class BaseCharacters:
|
||||
|
@ -114,7 +122,7 @@ class BaseCharacters:
|
|||
eos: str,
|
||||
bos: str,
|
||||
blank: str,
|
||||
is_unique: bool = True,
|
||||
is_unique: bool = False,
|
||||
is_sorted: bool = True,
|
||||
) -> None:
|
||||
self._characters = characters
|
||||
|
@ -202,14 +210,20 @@ class BaseCharacters:
|
|||
_vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
|
||||
self._vocab = _vocab + list(self._punctuations)
|
||||
self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
|
||||
self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
|
||||
self._id_to_char = {
|
||||
idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension
|
||||
}
|
||||
if self.is_unique:
|
||||
duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
|
||||
assert (
|
||||
len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
|
||||
), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}"
|
||||
), f" [!] There are duplicate characters in the character set. {duplicates}"
|
||||
|
||||
def char_to_id(self, char: str) -> int:
|
||||
return self._char_to_id[char]
|
||||
try:
|
||||
return self._char_to_id[char]
|
||||
except KeyError as e:
|
||||
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
|
||||
|
||||
def id_to_char(self, idx: int) -> str:
|
||||
return self._id_to_char[idx]
|
||||
|
@ -229,9 +243,23 @@ class BaseCharacters:
|
|||
print(f"{indent}| > Num chars: {self.num_chars}")
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config: "Coqpit"):
|
||||
return BaseCharacters(
|
||||
**config.characters if config.characters is not None else {},
|
||||
def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument
|
||||
"""Init your character class from a config.
|
||||
|
||||
Implement this method for your subclass.
|
||||
"""
|
||||
...
|
||||
|
||||
def to_config(self) -> "CharactersConfig":
|
||||
return CharactersConfig(
|
||||
characters=self._characters,
|
||||
punctuations=self._punctuations,
|
||||
pad=self._pad,
|
||||
eos=self._eos,
|
||||
bos=self._bos,
|
||||
blank=self._blank,
|
||||
is_unique=self.is_unique,
|
||||
is_sorted=self.is_sorted,
|
||||
)
|
||||
|
||||
|
||||
|
@ -275,31 +303,42 @@ class IPAPhonemes(BaseCharacters):
|
|||
eos: str = _eos,
|
||||
bos: str = _bos,
|
||||
blank: str = _blank,
|
||||
is_unique: bool = True,
|
||||
is_unique: bool = False,
|
||||
is_sorted: bool = True,
|
||||
) -> None:
|
||||
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config: "Coqpit"):
|
||||
"""Init an IPAPhonemes object from a model config
|
||||
|
||||
If characters are not defined in the config, it will be set to the default characters and the config
|
||||
will be updated.
|
||||
"""
|
||||
# band-aid for compatibility with old models
|
||||
if "characters" in config and config.characters is not None:
|
||||
if "phonemes" in config.characters and config.characters.phonemes is not None:
|
||||
config.characters["characters"] = config.characters["phonemes"]
|
||||
return IPAPhonemes(
|
||||
characters=config.characters["characters"],
|
||||
punctuations=config.characters["punctuations"],
|
||||
pad=config.characters["pad"],
|
||||
eos=config.characters["eos"],
|
||||
bos=config.characters["bos"],
|
||||
blank=config.characters["blank"],
|
||||
is_unique=config.characters["is_unique"],
|
||||
is_sorted=config.characters["is_sorted"],
|
||||
)
|
||||
else:
|
||||
return IPAPhonemes(
|
||||
**config.characters if config.characters is not None else {},
|
||||
return (
|
||||
IPAPhonemes(
|
||||
characters=config.characters["characters"],
|
||||
punctuations=config.characters["punctuations"],
|
||||
pad=config.characters["pad"],
|
||||
eos=config.characters["eos"],
|
||||
bos=config.characters["bos"],
|
||||
blank=config.characters["blank"],
|
||||
is_unique=config.characters["is_unique"],
|
||||
is_sorted=config.characters["is_sorted"],
|
||||
),
|
||||
config,
|
||||
)
|
||||
# use character set from config
|
||||
if config.characters is not None:
|
||||
return IPAPhonemes(**config.characters), config
|
||||
# return default character set
|
||||
characters = IPAPhonemes()
|
||||
new_config = replace(config, characters=characters.to_config())
|
||||
return characters, new_config
|
||||
|
||||
|
||||
class Graphemes(BaseCharacters):
|
||||
|
@ -339,24 +378,42 @@ class Graphemes(BaseCharacters):
|
|||
eos: str = _eos,
|
||||
bos: str = _bos,
|
||||
blank: str = _blank,
|
||||
is_unique: bool = True,
|
||||
is_unique: bool = False,
|
||||
is_sorted: bool = True,
|
||||
) -> None:
|
||||
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config: "Coqpit"):
|
||||
return Graphemes(
|
||||
**config.characters if config.characters is not None else {},
|
||||
)
|
||||
"""Init a Graphemes object from a model config
|
||||
|
||||
If characters are not defined in the config, it will be set to the default characters and the config
|
||||
will be updated.
|
||||
"""
|
||||
if config.characters is not None:
|
||||
# band-aid for compatibility with old models
|
||||
if "phonemes" in config.characters:
|
||||
return (
|
||||
Graphemes(
|
||||
characters=config.characters["characters"],
|
||||
punctuations=config.characters["punctuations"],
|
||||
pad=config.characters["pad"],
|
||||
eos=config.characters["eos"],
|
||||
bos=config.characters["bos"],
|
||||
blank=config.characters["blank"],
|
||||
is_unique=config.characters["is_unique"],
|
||||
is_sorted=config.characters["is_sorted"],
|
||||
),
|
||||
config,
|
||||
)
|
||||
return Graphemes(**config.characters), config
|
||||
characters = Graphemes()
|
||||
new_config = replace(config, characters=characters.to_config())
|
||||
return characters, new_config
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
gr = Graphemes()
|
||||
ph = IPAPhonemes()
|
||||
|
||||
print(gr.vocab)
|
||||
print(ph.vocab)
|
||||
|
||||
print(gr.num_chars)
|
||||
assert "a" == gr.id_to_char(gr.char_to_id("a"))
|
||||
gr.print_log()
|
||||
ph.print_log()
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import abc
|
||||
import itertools
|
||||
from typing import List, Tuple, Union
|
||||
from typing import List, Tuple
|
||||
|
||||
from TTS.tts.utils.text.punctuation import Punctuation
|
||||
|
||||
|
@ -8,6 +7,19 @@ from TTS.tts.utils.text.punctuation import Punctuation
|
|||
class BasePhonemizer(abc.ABC):
|
||||
"""Base phonemizer class
|
||||
|
||||
Phonemization follows the following steps:
|
||||
1. Preprocessing:
|
||||
- remove empty lines
|
||||
- remove punctuation
|
||||
- keep track of punctuation marks
|
||||
|
||||
2. Phonemization:
|
||||
- convert text to phonemes
|
||||
|
||||
3. Postprocessing:
|
||||
- join phonemes
|
||||
- restore punctuation marks
|
||||
|
||||
Args:
|
||||
language (str):
|
||||
Language used by the phonemizer.
|
||||
|
@ -51,40 +63,30 @@ class BasePhonemizer(abc.ABC):
|
|||
@abc.abstractmethod
|
||||
def name():
|
||||
"""The name of the backend"""
|
||||
...
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def is_available(cls):
|
||||
"""Returns True if the backend is installed, False otherwise"""
|
||||
...
|
||||
|
||||
@classmethod
|
||||
@abc.abstractmethod
|
||||
def version(cls):
|
||||
"""Return the backend version as a tuple (major, minor, patch)"""
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def supported_languages():
|
||||
"""Return a dict of language codes -> name supported by the backend"""
|
||||
...
|
||||
|
||||
def is_supported_language(self, language):
|
||||
"""Returns True if `language` is supported by the backend"""
|
||||
return language in self.supported_languages()
|
||||
|
||||
fr"""
|
||||
Phonemization follows the following steps:
|
||||
1. Preprocessing:
|
||||
- remove empty lines
|
||||
- remove punctuation
|
||||
- keep track of punctuation marks
|
||||
|
||||
2. Phonemization:
|
||||
- convert text to phonemes
|
||||
|
||||
3. Postprocessing:
|
||||
- join phonemes
|
||||
- restore punctuation marks
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def _phonemize(self, text, separator):
|
||||
"""The main phonemization method"""
|
||||
|
|
|
@ -28,29 +28,30 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
|
|||
"1", # UTF8 text encoding
|
||||
]
|
||||
cmd.extend(args)
|
||||
logging.debug("espeakng: executing %s" % repr(cmd))
|
||||
p = subprocess.Popen(
|
||||
logging.debug("espeakng: executing %s", repr(cmd))
|
||||
|
||||
with subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
res = iter(p.stdout.readline, b"")
|
||||
if not sync:
|
||||
) as p:
|
||||
res = iter(p.stdout.readline, b"")
|
||||
if not sync:
|
||||
p.stdout.close()
|
||||
if p.stderr:
|
||||
p.stderr.close()
|
||||
if p.stdin:
|
||||
p.stdin.close()
|
||||
return res
|
||||
res2 = []
|
||||
for line in res:
|
||||
res2.append(line)
|
||||
p.stdout.close()
|
||||
if p.stderr:
|
||||
p.stderr.close()
|
||||
if p.stdin:
|
||||
p.stdin.close()
|
||||
return res
|
||||
res2 = []
|
||||
for line in res:
|
||||
res2.append(line)
|
||||
p.stdout.close()
|
||||
if p.stderr:
|
||||
p.stderr.close()
|
||||
if p.stdin:
|
||||
p.stdin.close()
|
||||
p.wait()
|
||||
p.wait()
|
||||
return res2
|
||||
|
||||
|
||||
|
@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer):
|
|||
def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
|
||||
if self._ESPEAK_LIB is None:
|
||||
raise Exception("Unknown backend: %s" % backend)
|
||||
|
||||
# band-aid for backwards compatibility
|
||||
if language == "en":
|
||||
language = "en-us"
|
||||
|
||||
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
|
||||
if backend is not None:
|
||||
self.backend = backend
|
||||
|
||||
@property
|
||||
def backend(self):
|
||||
return self._ESPEAK_LIB
|
||||
|
||||
@backend.setter
|
||||
def backend(self, backend):
|
||||
if backend not in ["espeak", "espeak-ng"]:
|
||||
raise Exception("Unknown backend: %s" % backend)
|
||||
self._ESPEAK_LIB = backend
|
||||
|
||||
def auto_set_espeak_lib(self) -> None:
|
||||
if is_tool("espeak-ng"):
|
||||
|
@ -115,24 +133,25 @@ class ESpeak(BasePhonemizer):
|
|||
# espeak and espeak-ng parse `ipa` differently
|
||||
if tie:
|
||||
# use '͡' between phonemes
|
||||
if _DEF_ESPEAK_LIB == "espeak":
|
||||
if self.backend == "espeak":
|
||||
args.append("--ipa=1")
|
||||
else:
|
||||
args.append("--ipa=3")
|
||||
else:
|
||||
# split with '_'
|
||||
if _DEF_ESPEAK_LIB == "espeak":
|
||||
if self.backend == "espeak":
|
||||
args.append("--ipa=3")
|
||||
else:
|
||||
args.append("--ipa=1")
|
||||
if tie:
|
||||
args.append("--tie=%s" % tie)
|
||||
|
||||
args.append('"' + text + '"')
|
||||
# compute phonemes
|
||||
phonemes = ""
|
||||
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
|
||||
logging.debug("line: %s" % repr(line))
|
||||
phonemes += line.decode("utf8").strip()
|
||||
logging.debug("line: %s", repr(line))
|
||||
phonemes += line.decode("utf8").strip()[2:] # skip two redundant characters
|
||||
return phonemes.replace("_", separator)
|
||||
|
||||
def _phonemize(self, text, separator=None):
|
||||
|
@ -146,7 +165,7 @@ class ESpeak(BasePhonemizer):
|
|||
Dict: Dictionary of language codes.
|
||||
"""
|
||||
if _DEF_ESPEAK_LIB is None:
|
||||
raise {}
|
||||
return {}
|
||||
args = ["--voices"]
|
||||
langs = {}
|
||||
count = 0
|
||||
|
@ -157,7 +176,7 @@ class ESpeak(BasePhonemizer):
|
|||
lang_code = cols[1]
|
||||
lang_name = cols[3]
|
||||
langs[lang_code] = lang_name
|
||||
logging.debug("line: %s" % repr(line))
|
||||
logging.debug("line: %s", repr(line))
|
||||
count += 1
|
||||
return langs
|
||||
|
||||
|
@ -168,9 +187,9 @@ class ESpeak(BasePhonemizer):
|
|||
str: Version of the used backend.
|
||||
"""
|
||||
args = ["--version"]
|
||||
for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
|
||||
for line in _espeak_exe(self.backend, args, sync=True):
|
||||
version = line.decode("utf8").strip().split()[2]
|
||||
logging.debug("line: %s" % repr(line))
|
||||
logging.debug("line: %s", repr(line))
|
||||
return version
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import importlib
|
||||
from os import stat
|
||||
from typing import List
|
||||
|
||||
import gruut
|
||||
|
@ -55,7 +54,7 @@ class Gruut(BasePhonemizer):
|
|||
def name():
|
||||
return "gruut"
|
||||
|
||||
def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:
|
||||
def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
|
||||
"""Convert input text to phonemes.
|
||||
|
||||
Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
|
||||
|
|
|
@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer):
|
|||
|
||||
language = "ja-jp"
|
||||
|
||||
def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs):
|
||||
def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
|
||||
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
|
||||
|
||||
@staticmethod
|
||||
|
@ -61,12 +61,12 @@ class JA_JP_Phonemizer(BasePhonemizer):
|
|||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
text = "これは、電話をかけるための私の日本語の例のテキストです。"
|
||||
e = JA_JP_Phonemizer()
|
||||
print(e.supported_languages())
|
||||
print(e.version())
|
||||
print(e.language)
|
||||
print(e.name())
|
||||
print(e.is_available())
|
||||
print("`" + e.phonemize(text) + "`")
|
||||
# if __name__ == "__main__":
|
||||
# text = "これは、電話をかけるための私の日本語の例のテキストです。"
|
||||
# e = JA_JP_Phonemizer()
|
||||
# print(e.supported_languages())
|
||||
# print(e.version())
|
||||
# print(e.language)
|
||||
# print(e.name())
|
||||
# print(e.is_available())
|
||||
# print("`" + e.phonemize(text) + "`")
|
||||
|
|
|
@ -17,7 +17,7 @@ class MultiPhonemizer:
|
|||
lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER
|
||||
language = "multi-lingual"
|
||||
|
||||
def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None:
|
||||
def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value
|
||||
self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer)
|
||||
self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
|
||||
|
||||
|
@ -40,16 +40,16 @@ class MultiPhonemizer:
|
|||
return list(self.lang_to_phonemizer_name.keys())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
texts = {
|
||||
"tr": "Merhaba, bu Türkçe bit örnek!",
|
||||
"en-us": "Hello, this is English example!",
|
||||
"de": "Hallo, das ist ein Deutches Beipiel!",
|
||||
"zh-cn": "这是中国的例子",
|
||||
}
|
||||
phonemes = {}
|
||||
ph = MultiPhonemizer()
|
||||
for lang, text in texts.items():
|
||||
phoneme = ph.phonemize(text, lang)
|
||||
phonemes[lang] = phoneme
|
||||
print(phonemes)
|
||||
# if __name__ == "__main__":
|
||||
# texts = {
|
||||
# "tr": "Merhaba, bu Türkçe bit örnek!",
|
||||
# "en-us": "Hello, this is English example!",
|
||||
# "de": "Hallo, das ist ein Deutches Beipiel!",
|
||||
# "zh-cn": "这是中国的例子",
|
||||
# }
|
||||
# phonemes = {}
|
||||
# ph = MultiPhonemizer()
|
||||
# for lang, text in texts.items():
|
||||
# phoneme = ph.phonemize(text, lang)
|
||||
# phonemes[lang] = phoneme
|
||||
# print(phonemes)
|
||||
|
|
|
@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer):
|
|||
|
||||
language = "zh-cn"
|
||||
|
||||
def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs):
|
||||
def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
|
||||
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
|
||||
|
||||
@staticmethod
|
||||
def name():
|
||||
return "zh_cn_phonemizer"
|
||||
|
||||
def phonemize_zh_cn(self, text: str, separator: str = "|") -> str:
|
||||
@staticmethod
|
||||
def phonemize_zh_cn(text: str, separator: str = "|") -> str:
|
||||
ph = chinese_text_to_phonemes(text, separator)
|
||||
return ph
|
||||
|
||||
|
@ -50,12 +51,12 @@ class ZH_CN_Phonemizer(BasePhonemizer):
|
|||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
text = "这是,样本中文。"
|
||||
e = ZH_CN_Phonemizer()
|
||||
print(e.supported_languages())
|
||||
print(e.version())
|
||||
print(e.language)
|
||||
print(e.name())
|
||||
print(e.is_available())
|
||||
print("`" + e.phonemize(text) + "`")
|
||||
# if __name__ == "__main__":
|
||||
# text = "这是,样本中文。"
|
||||
# e = ZH_CN_Phonemizer()
|
||||
# print(e.supported_languages())
|
||||
# print(e.version())
|
||||
# print(e.language)
|
||||
# print(e.name())
|
||||
# print(e.is_available())
|
||||
# print("`" + e.phonemize(text) + "`")
|
||||
|
|
|
@ -130,7 +130,7 @@ class Punctuation:
|
|||
return cls._restore(text, puncs, 0)
|
||||
|
||||
@classmethod
|
||||
def _restore(cls, text, puncs, num):
|
||||
def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
|
||||
"""Auxiliary method for Punctuation.restore()"""
|
||||
if not puncs:
|
||||
return text
|
||||
|
@ -159,14 +159,14 @@ class Punctuation:
|
|||
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
punc = Punctuation()
|
||||
text = "This is. This is, example!"
|
||||
# if __name__ == "__main__":
|
||||
# punc = Punctuation()
|
||||
# text = "This is. This is, example!"
|
||||
|
||||
print(punc.strip(text))
|
||||
# print(punc.strip(text))
|
||||
|
||||
split_text, puncs = punc.strip_to_restore(text)
|
||||
print(split_text, " ---- ", puncs)
|
||||
# split_text, puncs = punc.strip_to_restore(text)
|
||||
# print(split_text, " ---- ", puncs)
|
||||
|
||||
restored_text = punc.restore(split_text, puncs)
|
||||
print(restored_text)
|
||||
# restored_text = punc.restore(split_text, puncs)
|
||||
# print(restored_text)
|
||||
|
|
|
@ -383,8 +383,7 @@ class AudioProcessor(object):
|
|||
def init_from_config(config: "Coqpit"):
|
||||
if "audio" in config:
|
||||
return AudioProcessor(**config.audio)
|
||||
else:
|
||||
return AudioProcessor(**config)
|
||||
return AudioProcessor(**config)
|
||||
|
||||
### setting up the parameters ###
|
||||
def _build_mel_basis(
|
||||
|
|
|
@ -13,7 +13,6 @@ from TTS.tts.utils.speakers import SpeakerManager
|
|||
# pylint: disable=unused-wildcard-import
|
||||
# pylint: disable=wildcard-import
|
||||
from TTS.tts.utils.synthesis import synthesis, trim_silence
|
||||
from TTS.tts.utils.text import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.vocoder.models import setup_model as setup_vocoder_model
|
||||
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
|
||||
|
|
|
@ -314,7 +314,7 @@ class GAN(BaseVocoder):
|
|||
data_items: List,
|
||||
verbose: bool,
|
||||
num_gpus: int,
|
||||
rank: int = 0, # pylint: disable=unused-argument
|
||||
rank: int = None, # pylint: disable=unused-argument
|
||||
):
|
||||
"""Initiate and return the GAN dataloader.
|
||||
|
||||
|
|
Loading…
Reference in New Issue