mirror of https://github.com/coqui-ai/TTS.git
Make style
This commit is contained in:
parent d045bfce41
commit b094979f1a
@@ -84,7 +84,24 @@ class XttsConfig(BaseTTSConfig):
     audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
     model_dir: str = None
     languages: List[str] = field(
-        default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja"]
+        default_factory=lambda: [
+            "en",
+            "es",
+            "fr",
+            "de",
+            "it",
+            "pt",
+            "pl",
+            "tr",
+            "ru",
+            "nl",
+            "cs",
+            "ar",
+            "zh-cn",
+            "hu",
+            "ko",
+            "ja",
+        ]
     )
 
     # inference params
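For context, the expanded default above is a plain dataclass field, so the supported-language list can be inspected or overridden at runtime. A minimal sketch, assuming the import path TTS.tts.configs.xtts_config used by this repo:

    from TTS.tts.configs.xtts_config import XttsConfig

    config = XttsConfig()
    print(config.languages)           # the 16 defaults listed in the diff above
    config.languages = ["en", "de"]   # narrow the set, e.g. for a custom fine-tune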
@@ -13,6 +13,7 @@ from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel
 from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder
 from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler
 
+
 def null_position_embeddings(range, dim):
     return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
 
@@ -186,7 +187,9 @@ class GPT(nn.Module):
     def get_grad_norm_parameter_groups(self):
         return {
             "conditioning_encoder": list(self.conditioning_encoder.parameters()),
-            "conditioning_perceiver": list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None,
+            "conditioning_perceiver": list(self.conditioning_perceiver.parameters())
+            if self.use_perceiver_resampler
+            else None,
             "gpt": list(self.gpt.parameters()),
             "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
         }
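The groups returned above are intended for per-module gradient-norm logging. A hedged sketch of how such a mapping can be consumed; grad_norm here is an illustrative helper, not part of the repo:

    def grad_norm(params):
        # L2 norm over all gradients in one parameter group
        total = 0.0
        for p in params:
            if p.grad is not None:
                total += p.grad.norm().item() ** 2
        return total**0.5

    # norms = {name: grad_norm(ps) for name, ps in model.get_grad_norm_parameter_groups().items() if ps}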
@@ -355,9 +358,9 @@ class GPT(nn.Module):
         if not return_latent:
             if cond_input.ndim == 4:
                 cond_input = cond_input.squeeze(1)
-            conds = self.conditioning_encoder(cond_input) # (b, d, s)
+            conds = self.conditioning_encoder(cond_input)  # (b, d, s)
             if self.use_perceiver_resampler:
-                conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2) # (b, d, 32)
+                conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2)  # (b, d, 32)
         else:
             # already computed
             conds = cond_input.unsqueeze(1)
@@ -16,8 +16,10 @@ from einops.layers.torch import Rearrange
 def exists(val):
     return val is not None
 
+
 def once(fn):
     called = False
 
     @wraps(fn)
     def inner(x):
         nonlocal called
@@ -25,19 +27,17 @@ def once(fn):
             return
         called = True
         return fn(x)
+
     return inner
 
+
 print_once = once(print)
 
 # main class
 
+
 class Attend(nn.Module):
-    def __init__(
-        self,
-        dropout = 0.,
-        causal = False,
-        use_flash = False
-    ):
+    def __init__(self, dropout=0.0, causal=False, use_flash=False):
         super().__init__()
         self.dropout = dropout
         self.attn_dropout = nn.Dropout(dropout)
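The once helper reformatted above runs its wrapped function a single time and silently drops later calls; print_once is used below for one-shot GPU diagnostics. A self-contained illustration:

    from functools import wraps

    def once(fn):
        called = False

        @wraps(fn)
        def inner(x):
            nonlocal called
            if called:
                return
            called = True
            return fn(x)

        return inner

    print_once = once(print)
    print_once("shown")    # prints
    print_once("skipped")  # ignored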
@@ -46,23 +46,25 @@ class Attend(nn.Module):
         self.register_buffer("mask", None, persistent=False)
 
         self.use_flash = use_flash
-        assert not (use_flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
+        assert not (
+            use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
+        ), "in order to use flash attention, you must be using pytorch 2.0 or above"
 
         # determine efficient attention configs for cuda and cpu
-        self.config = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
+        self.config = namedtuple("EfficientAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"])
         self.cpu_config = self.config(True, True, True)
         self.cuda_config = None
 
         if not torch.cuda.is_available() or not use_flash:
             return
 
-        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))
+        device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
 
         if device_properties.major == 8 and device_properties.minor == 0:
-            print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
+            print_once("A100 GPU detected, using flash attention if input tensor is on cuda")
             self.cuda_config = self.config(True, False, False)
         else:
-            print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
+            print_once("Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda")
             self.cuda_config = self.config(False, True, True)
 
     def get_mask(self, n, device):
@@ -73,23 +75,23 @@ class Attend(nn.Module):
         self.register_buffer("mask", mask, persistent=False)
         return mask
 
-    def flash_attn(self, q, k, v, mask = None):
+    def flash_attn(self, q, k, v, mask=None):
         _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
 
         # Recommended for multi-query single-key-value attention by Tri Dao
        # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
 
         if k.ndim == 3:
-            k = rearrange(k, 'b ... -> b 1 ...').expand_as(q)
+            k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
 
         if v.ndim == 3:
-            v = rearrange(v, 'b ... -> b 1 ...').expand_as(q)
+            v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
 
         # Check if mask exists and expand to compatible shape
         # The mask is B L, so it would have to be expanded to B H N L
 
         if exists(mask):
-            mask = rearrange(mask, 'b j -> b 1 1 j')
+            mask = rearrange(mask, "b j -> b 1 1 j")
             mask = mask.expand(-1, heads, q_len, -1)
 
         # Check if there is a compatible device for flash attention
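The rearrange calls above only change quoting style; behavior is unchanged. A toy check of the mask expansion, with made-up shapes for illustration:

    import torch
    from einops import rearrange

    mask = torch.ones(2, 5, dtype=torch.bool)  # (b, j)
    mask = rearrange(mask, "b j -> b 1 1 j")   # (b, 1, 1, j)
    mask = mask.expand(-1, 8, 5, -1)           # (b, heads, q_len, j)
    print(mask.shape)                          # torch.Size([2, 8, 5, 5])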
@@ -100,15 +102,12 @@ class Attend(nn.Module):
 
         with torch.backends.cuda.sdp_kernel(**config._asdict()):
             out = F.scaled_dot_product_attention(
-                q, k, v,
-                attn_mask = mask,
-                dropout_p = self.dropout if self.training else 0.,
-                is_causal = self.causal
+                q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0.0, is_causal=self.causal
             )
 
         return out
 
-    def forward(self, q, k, v, mask = None):
+    def forward(self, q, k, v, mask=None):
         """
         einstein notation
         b - batch
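The call collapsed onto one line above is the standard PyTorch 2.x fused-attention entry point. A standalone sketch with toy shapes (batch, heads, sequence, head dim):

    import torch
    import torch.nn.functional as F

    q = torch.randn(1, 8, 16, 64)
    k = torch.randn(1, 8, 16, 64)
    v = torch.randn(1, 8, 16, 64)
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
    print(out.shape)  # torch.Size([1, 8, 16, 64])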
@@ -122,9 +121,9 @@ class Attend(nn.Module):
         scale = q.shape[-1] ** -0.5
 
         if self.use_flash:
-            return self.flash_attn(q, k, v, mask = mask)
+            return self.flash_attn(q, k, v, mask=mask)
 
-        kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d'
+        kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
 
         # similarity
 
@@ -133,7 +132,7 @@ class Attend(nn.Module):
         # key padding mask
 
         if exists(mask):
-            mask = rearrange(mask, 'b j -> b 1 1 j')
+            mask = rearrange(mask, "b j -> b 1 1 j")
             sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
 
         # causal mask
@@ -153,6 +152,7 @@ class Attend(nn.Module):
 
         return out
 
+
 def Sequential(*mods):
     return nn.Sequential(*filter(exists, mods))
 
@@ -161,9 +161,9 @@ _abbreviations = {
     "hu": [
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
-            ("dr", "doktor"), # doctor
-            ("b", "bácsi"), # Mr.
-            ("nőv", "nővér"), # nurse
+            ("dr", "doktor"),  # doctor
+            ("b", "bácsi"),  # Mr.
+            ("nőv", "nővér"),  # nurse
             # Add other Hungarian abbreviations here if needed.
         ]
     ],
@@ -171,9 +171,8 @@ _abbreviations = {
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
             # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
-
         ]
     ],
 }
 
@@ -354,7 +353,7 @@ _symbols_multilingual = {
             ("#", " kettőskereszt "),
             ("$", " dollár "),
             ("£", " font "),
-            ("°", " fok ")
+            ("°", " fok "),
         ]
     ],
     "ko": [
@@ -367,9 +366,9 @@ _symbols_multilingual = {
             ("#", " 번호 "),
             ("$", " 달러 "),
             ("£", " 파운드 "),
-            ("°", " 도 ")
+            ("°", " 도 "),
         ]
     ],
 }
 
@@ -463,14 +462,9 @@ def _expand_ordinal(m, lang="en"):
 def _expand_number(m, lang="en"):
     return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
 
-<<<<<<< HEAD
 
 def expand_numbers_multilingual(text, lang="en"):
     if lang == "zh-cn":
-=======
-def expand_numbers_multilingual(text, lang='en'):
-    if lang == "zh" or lang == "zh-cn":
->>>>>>> Update model entry
         text = zh_num2words()(text)
     else:
         if lang in ["en", "ru"]:
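The resolved branch keeps the "cs" to "cz" remapping in _expand_number because num2words uses its own non-ISO code for Czech. A quick illustration:

    from num2words import num2words

    print(num2words(50, lang="en"))  # fifty
    print(num2words(50, lang="cz"))  # num2words expects "cz", not ISO 639-1 "cs"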
@@ -521,24 +515,15 @@ def basic_cleaners(text):
 
 def chinese_transliterate(text):
     return "".join(
-        p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
+        [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
     )
 
 
 def japanese_cleaners(text, katsu):
     text = katsu.romaji(text)
     text = lowercase(text)
     return text
 
-<<<<<<< HEAD
-
-class VoiceBpeTokenizer:
-    def __init__(self, vocab_file=None, preprocess=None):
-        self.tokenizer = None
-        self.katsu = None
-=======
-
->>>>>>> Update model entry
 
 def korean_cleaners(text):
     r = Transliter(academic)
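The list comprehension is now wrapped in brackets purely for style; the joined pinyin output is identical. A minimal pypinyin example mirroring the call above:

    import pypinyin

    pinyin = pypinyin.pinyin("你好", style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
    print("".join(p[0] for p in pinyin))  # ni3hao3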
@@ -559,32 +544,14 @@ def preprocess_text(txt, lang):
     return txt
 
 
-<<<<<<< HEAD
-    def preprocess_text(self, txt, lang):
-        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
-            txt = multilingual_cleaners(txt, lang)
-            if lang == "zh-cn":
-                txt = chinese_transliterate(txt)
-        elif lang == "ja":
-            if self.katsu is None:
-                import cutlet
+DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")
 
-                self.katsu = cutlet.Cutlet()
-            txt = japanese_cleaners(txt, self.katsu)
-        else:
-            raise NotImplementedError()
-        return txt
-=======
-DEFAULT_VOCAB_FILE = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json"
-)
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file=None):
         self.tokenizer = None
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
->>>>>>> Update model entry
 
     def encode(self, txt, lang):
         txt = preprocess_text(txt, lang)
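After the conflict resolution above, VoiceBpeTokenizer is a thin wrapper over the Hugging Face tokenizers library. A hedged usage sketch, assuming a local vocab.json in that library's format:

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("vocab.json")
    encoding = tokenizer.encode("hello world")
    print(encoding.ids)
    print(tokenizer.decode(encoding.ids))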
@@ -611,139 +578,145 @@ class VoiceBpeTokenizer:
 def test_expand_numbers_multilingual():
     test_cases = [
         # English
-        ("In 12.5 seconds.", 'In twelve point five seconds.', 'en'),
-        ("There were 50 soldiers.", 'There were fifty soldiers.', 'en'),
-        ("This is a 1st test", 'This is a first test', 'en'),
-        ("That will be $20 sir.", 'That will be twenty dollars sir.', 'en'),
-        ("That will be 20€ sir.", 'That will be twenty euro sir.', 'en'),
-        ("That will be 20.15€ sir.", 'That will be twenty euro, fifteen cents sir.', 'en'),
-        ("That's 100,000.5.", 'That\'s one hundred thousand point five.', 'en'),
+        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
+        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
+        ("This is a 1st test", "This is a first test", "en"),
+        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
+        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
+        ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"),
+        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
         # French
-        ("En 12,5 secondes.", 'En douze virgule cinq secondes.', 'fr'),
-        ("Il y avait 50 soldats.", 'Il y avait cinquante soldats.', 'fr'),
-        ("Ceci est un 1er test", 'Ceci est un premier test', 'fr'),
-        ("Cela vous fera $20 monsieur.", 'Cela vous fera vingt dollars monsieur.', 'fr'),
-        ("Cela vous fera 20€ monsieur.", 'Cela vous fera vingt euros monsieur.', 'fr'),
-        ("Cela vous fera 20,15€ monsieur.", 'Cela vous fera vingt euros et quinze centimes monsieur.', 'fr'),
-        ("Ce sera 100.000,5.", 'Ce sera cent mille virgule cinq.', 'fr'),
+        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
+        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
+        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
+        ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"),
+        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
+        ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"),
+        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
         # German
-        ("In 12,5 Sekunden.", 'In zwölf Komma fünf Sekunden.', 'de'),
-        ("Es gab 50 Soldaten.", 'Es gab fünfzig Soldaten.', 'de'),
-        ("Dies ist ein 1. Test", 'Dies ist ein erste Test', 'de'), # Issue with gender
-        ("Das macht $20 Herr.", 'Das macht zwanzig Dollar Herr.', 'de'),
-        ("Das macht 20€ Herr.", 'Das macht zwanzig Euro Herr.', 'de'),
-        ("Das macht 20,15€ Herr.", 'Das macht zwanzig Euro und fünfzehn Cent Herr.', 'de'),
+        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
+        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
+        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
+        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
+        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
+        ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"),
         # Spanish
-        ("En 12,5 segundos.", 'En doce punto cinco segundos.', 'es'),
-        ("Había 50 soldados.", 'Había cincuenta soldados.', 'es'),
-        ("Este es un 1er test", 'Este es un primero test', 'es'),
-        ("Eso le costará $20 señor.", 'Eso le costará veinte dólares señor.', 'es'),
-        ("Eso le costará 20€ señor.", 'Eso le costará veinte euros señor.', 'es'),
-        ("Eso le costará 20,15€ señor.", 'Eso le costará veinte euros con quince céntimos señor.', 'es'),
+        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
+        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
+        ("Este es un 1er test", "Este es un primero test", "es"),
+        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
+        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
+        ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"),
         # Italian
-        ("In 12,5 secondi.", 'In dodici virgola cinque secondi.', 'it'),
-        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", 'it'),
-        ("Questo è un 1° test", 'Questo è un primo test', 'it'),
-        ("Ti costerà $20 signore.", 'Ti costerà venti dollari signore.', 'it'),
-        ("Ti costerà 20€ signore.", 'Ti costerà venti euro signore.', 'it'),
-        ("Ti costerà 20,15€ signore.", 'Ti costerà venti euro e quindici centesimi signore.', 'it'),
+        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
+        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
+        ("Questo è un 1° test", "Questo è un primo test", "it"),
+        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
+        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
+        ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"),
         # Portuguese
-        ("Em 12,5 segundos.", 'Em doze vírgula cinco segundos.', 'pt'),
-        ("Havia 50 soldados.", 'Havia cinquenta soldados.', 'pt'),
-        ("Este é um 1º teste", 'Este é um primeiro teste', 'pt'),
-        ("Isso custará $20 senhor.", 'Isso custará vinte dólares senhor.', 'pt'),
-        ("Isso custará 20€ senhor.", 'Isso custará vinte euros senhor.', 'pt'),
-        ("Isso custará 20,15€ senhor.", 'Isso custará vinte euros e quinze cêntimos senhor.', 'pt'), # "cêntimos" should be "centavos" num2words issue
+        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
+        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
+        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
+        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
+        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
+        (
+            "Isso custará 20,15€ senhor.",
+            "Isso custará vinte euros e quinze cêntimos senhor.",
+            "pt",
+        ),  # "cêntimos" should be "centavos" num2words issue
         # Polish
-        ("W 12,5 sekundy.", 'W dwanaście przecinek pięć sekundy.', 'pl'),
-        ("Było 50 żołnierzy.", 'Było pięćdziesiąt żołnierzy.', 'pl'),
-        ("To będzie kosztować 20€ panie.", 'To będzie kosztować dwadzieścia euro panie.', 'pl'),
-        ("To będzie kosztować 20,15€ panie.", 'To będzie kosztować dwadzieścia euro, piętnaście centów panie.', 'pl'),
+        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
+        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
+        ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"),
+        ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"),
         # Arabic
-        ("في الـ 12,5 ثانية.", 'في الـ اثنا عشر , خمسون ثانية.', 'ar'),
-        ("كان هناك 50 جنديًا.", 'كان هناك خمسون جنديًا.', 'ar'),
+        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
+        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
         # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words
         # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
         # Czech
-        ("Za 12,5 vteřiny.", 'Za dvanáct celá pět vteřiny.', 'cs'),
-        ("Bylo tam 50 vojáků.", 'Bylo tam padesát vojáků.', 'cs'),
-        ("To bude stát 20€ pane.", 'To bude stát dvacet euro pane.', 'cs'),
-        ("To bude 20.15€ pane.", 'To bude dvacet euro, patnáct centů pane.', 'cs'),
+        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
+        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
+        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
+        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
         # Russian
-        ("Через 12.5 секунды.", 'Через двенадцать запятая пять секунды.', 'ru'),
-        ("Там было 50 солдат.", 'Там было пятьдесят солдат.', 'ru'),
-        ("Это будет 20.15€ сэр.", 'Это будет двадцать евро, пятнадцать центов сэр.', 'ru'),
-        ("Это будет стоить 20€ господин.", 'Это будет стоить двадцать евро господин.', 'ru'),
+        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
+        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
+        ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"),
+        ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"),
         # Dutch
-        ("In 12,5 seconden.", 'In twaalf komma vijf seconden.', 'nl'),
-        ("Er waren 50 soldaten.", 'Er waren vijftig soldaten.', 'nl'),
-        ("Dat wordt dan $20 meneer.", 'Dat wordt dan twintig dollar meneer.', 'nl'),
-        ("Dat wordt dan 20€ meneer.", 'Dat wordt dan twintig euro meneer.', 'nl'),
+        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
+        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
+        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
+        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
         # Chinese (Simplified)
-        ("在12.5秒内", '在十二点五秒内', 'zh'),
-        ("有50名士兵", '有五十名士兵', 'zh'),
+        ("在12.5秒内", "在十二点五秒内", "zh"),
+        ("有50名士兵", "有五十名士兵", "zh"),
         # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
         # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
         # Turkish
         # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
-        ("50 asker vardı.", 'elli asker vardı.', 'tr'),
-        ("Bu 1. test", 'Bu birinci test', 'tr'),
+        ("50 asker vardı.", "elli asker vardı.", "tr"),
+        ("Bu 1. test", "Bu birinci test", "tr"),
         # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
         # Hungarian
-        ("12,5 másodperc alatt.", 'tizenkettő egész öt tized másodperc alatt.', 'hu'),
-        ("50 katona volt.", 'ötven katona volt.', 'hu'),
-        ("Ez az 1. teszt", 'Ez az első teszt', 'hu'),
+        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
+        ("50 katona volt.", "ötven katona volt.", "hu"),
+        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
         # Korean
-        ("12.5 초 안에.", '십이 점 다섯 초 안에.', 'ko'),
-        ("50 명의 병사가 있었다.", '오십 명의 병사가 있었다.', 'ko'),
-        ("이것은 1 번째 테스트입니다", '이것은 첫 번째 테스트입니다', 'ko'),
+        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
+        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
+        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
     ]
     for a, b, lang in test_cases:
         out = expand_numbers_multilingual(a, lang=lang)
         assert out == b, f"'{out}' vs '{b}'"
 
 
 def test_abbreviations_multilingual():
     test_cases = [
         # English
-        ("Hello Mr. Smith.", 'Hello mister Smith.', 'en'),
-        ("Dr. Jones is here.", 'doctor Jones is here.', 'en'),
+        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
+        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
         # Spanish
-        ("Hola Sr. Garcia.", 'Hola señor Garcia.', 'es'),
-        ("La Dra. Martinez es muy buena.", 'La doctora Martinez es muy buena.', 'es'),
+        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
+        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
         # French
-        ("Bonjour Mr. Dupond.", 'Bonjour monsieur Dupond.', 'fr'),
-        ("Mme. Moreau est absente aujourd'hui.", 'madame Moreau est absente aujourd\'hui.', 'fr'),
+        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
+        ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"),
         # German
-        ("Frau Dr. Müller ist sehr klug.", 'Frau doktor Müller ist sehr klug.', 'de'),
+        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
         # Portuguese
-        ("Olá Sr. Silva.", 'Olá senhor Silva.', 'pt'),
-        ("Dra. Costa, você está disponível?", 'doutora Costa, você está disponível?', 'pt'),
+        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
+        ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"),
         # Italian
-        ("Buongiorno, Sig. Rossi.", 'Buongiorno, signore Rossi.', 'it'),
-        #("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
+        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
+        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
         # Polish
-        ("Dzień dobry, P. Kowalski.", 'Dzień dobry, pani Kowalski.', 'pl'),
-        ("M. Nowak, czy mogę zadać pytanie?", 'pan Nowak, czy mogę zadać pytanie?', 'pl'),
+        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
+        ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"),
         # Czech
-        ("P. Novák", "pan Novák", 'cs'),
-        ("Dr. Vojtěch", "doktor Vojtěch", 'cs'),
+        ("P. Novák", "pan Novák", "cs"),
+        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
         # Dutch
-        ("Dhr. Jansen", "de heer Jansen", 'nl'),
-        ("Mevr. de Vries", "mevrouw de Vries", 'nl'),
+        ("Dhr. Jansen", "de heer Jansen", "nl"),
+        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
         # Russian
-        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", 'ru'),
-        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", 'ru'),
+        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
+        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"),
         # Turkish
-        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", 'tr'),
-        ("Dr. Ayşe burada.", "doktor Ayşe burada.", 'tr'),
+        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
+        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
         # Hungarian
-        ("Dr. Szabó itt van.", "doktor Szabó itt van.", 'hu'),
+        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
     ]
 
     for a, b, lang in test_cases:
         out = expand_abbreviations_multilingual(a, lang=lang)
         assert out == b, f"'{out}' vs '{b}'"
 
 
 def test_symbols_multilingual():
     test_cases = [
         ("I have 14% battery", "I have 14 percent battery", "en"),
@@ -763,14 +736,15 @@ def test_symbols_multilingual():
         ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
         ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
         ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
-        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko")
+        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
     ]
 
     for a, b, lang in test_cases:
         out = expand_symbols_multilingual(a, lang=lang)
         assert out == b, f"'{out}' vs '{b}'"
 
+
 if __name__ == "__main__":
     test_expand_numbers_multilingual()
     test_abbreviations_multilingual()
     test_symbols_multilingual()
@@ -149,7 +149,11 @@ class XTTSDataset(torch.utils.data.Dataset):
             # if use masking do not use cond_len
             cond_len = torch.nan
         else:
-            ref_sample = sample["reference_path"] if "reference_path" in sample and sample["reference_path"] is not None else audiopath
+            ref_sample = (
+                sample["reference_path"]
+                if "reference_path" in sample and sample["reference_path"] is not None
+                else audiopath
+            )
             cond, cond_len, _ = get_prompt_slice(
                 ref_sample, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
             )
@@ -210,7 +214,9 @@ class XTTSDataset(torch.utils.data.Dataset):
             "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
             "filenames": audiopath,
             "conditioning": cond.unsqueeze(1),
-            "cond_lens": torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len]),
+            "cond_lens": torch.tensor(cond_len, dtype=torch.long)
+            if cond_len is not torch.nan
+            else torch.tensor([cond_len]),
             "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
         }
         return res
@@ -213,7 +213,13 @@ class GPTTrainer(BaseTTS):
             cond_lens: long tensor, (b,)
         """
         losses = self.xtts.gpt(
-            text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels=cond_mels, cond_idxs=cond_idxs, cond_lens=cond_lens,
+            text_inputs,
+            text_lengths,
+            audio_codes,
+            wav_lengths,
+            cond_mels=cond_mels,
+            cond_idxs=cond_idxs,
+            cond_lens=cond_lens,
         )
         return losses
 
@@ -227,7 +233,12 @@ class GPTTrainer(BaseTTS):
             print(" | > Synthesizing test sentences.")
             for idx, s_info in enumerate(self.config.test_sentences):
                 wav = self.xtts.synthesize(
-                    s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3, decoder="ne_hifigan"
+                    s_info["text"],
+                    self.config,
+                    s_info["speaker_wav"],
+                    s_info["language"],
+                    gpt_cond_len=3,
+                    decoder="ne_hifigan",
                 )["wav"]
                 test_audios["{}-audio".format(idx)] = wav
 
@@ -295,7 +306,9 @@ class GPTTrainer(BaseTTS):
         cond_idxs = batch["cond_idxs"]
         cond_lens = batch["cond_lens"]
 
-        loss_text, loss_mel, _ = self.forward(text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens)
+        loss_text, loss_mel, _ = self.forward(
+            text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens
+        )
         loss_dict["loss_text_ce"] = loss_text * self.args.gpt_loss_text_ce_weight
         loss_dict["loss_mel_ce"] = loss_mel * self.args.gpt_loss_mel_ce_weight
         loss_dict["loss"] = loss_dict["loss_text_ce"] + loss_dict["loss_mel_ce"]
(File diff suppressed because it is too large.)
@@ -381,7 +381,8 @@ class Xtts(BaseTTS):
         audio_22k = torchaudio.functional.resample(audio, sr, 22050)
         audio_22k = audio_22k[:, : 22050 * length]
         if self.args.gpt_use_perceiver_resampler:
-            mel = wav_to_mel_cloning(audio_22k,
+            mel = wav_to_mel_cloning(
+                audio_22k,
                 mel_norms=self.mel_stats.cpu(),
                 n_fft=2048,
                 hop_length=256,
@@ -391,10 +392,11 @@ class Xtts(BaseTTS):
                 sample_rate=22050,
                 f_min=0,
                 f_max=8000,
-                n_mels=80
+                n_mels=80,
             )
         else:
-            mel = wav_to_mel_cloning(audio_22k,
+            mel = wav_to_mel_cloning(
+                audio_22k,
                 mel_norms=self.mel_stats.cpu(),
                 n_fft=4096,
                 hop_length=1024,
@@ -404,7 +406,7 @@ class Xtts(BaseTTS):
                 sample_rate=22050,
                 f_min=0,
                 f_max=8000,
-                n_mels=80
+                n_mels=80,
             )
         cond_latent = self.gpt.get_style_emb(mel.to(self.device))
         return cond_latent.transpose(1, 2)
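The two branches above differ only in STFT resolution (n_fft and hop_length) for the perceiver and non-perceiver conditioning paths. For orientation, a rough torchaudio stand-in with the same parameters; wav_to_mel_cloning itself also applies mel normalization, which this sketch omits:

    import torchaudio

    mel_perceiver = torchaudio.transforms.MelSpectrogram(
        sample_rate=22050, n_fft=2048, hop_length=256, f_min=0, f_max=8000, n_mels=80
    )
    mel_default = torchaudio.transforms.MelSpectrogram(
        sample_rate=22050, n_fft=4096, hop_length=1024, f_min=0, f_max=8000, n_mels=80
    )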
@@ -598,7 +600,10 @@ class Xtts(BaseTTS):
         Sample rate is 24kHz.
         """
         (gpt_cond_latent, diffusion_conditioning, speaker_embedding) = self.get_conditioning_latents(
-            audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len, max_ref_length=max_ref_len, sound_norm_refs=sound_norm_refs
+            audio_path=ref_audio_path,
+            gpt_cond_len=gpt_cond_len,
+            max_ref_length=max_ref_len,
+            sound_norm_refs=sound_norm_refs,
         )
 
         return self.inference(
@@ -728,7 +733,12 @@ class Xtts(BaseTTS):
             )
             wav = self.vocoder.inference(mel)
 
-        return {"wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, "diffusion_conditioning": diffusion_conditioning}
+        return {
+            "wav": wav.cpu().numpy().squeeze(),
+            "gpt_latents": gpt_latents,
+            "speaker_embedding": speaker_embedding,
+            "diffusion_conditioning": diffusion_conditioning,
+        }
 
     def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
         """Handle chunk formatting in streaming mode"""
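The dictionary returned above is what callers of Xtts inference consume. A hedged end-to-end sketch; paths are placeholders, and the loading pattern mirrors the documented XTTS usage:

    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import Xtts

    config = XttsConfig()
    config.load_json("/path/to/config.json")
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir="/path/to/checkpoint/", eval=True)
    out = model.synthesize("Hello world.", config, speaker_wav="/path/to/ref.wav", language="en")
    # out["wav"]: 24 kHz waveform, per the docstring noted above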
@@ -61,13 +61,15 @@ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v2.0/voca
 XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v2.0/model.pth"
 
 # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
-TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file
-XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, XTTS_CHECKPOINT_LINK.split("/")[-1]) # model.pth file
+TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1])  # vocab.json file
+XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, XTTS_CHECKPOINT_LINK.split("/")[-1])  # model.pth file
 
 # download XTTS v2.0 files if needed
 if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
     print(" > Downloading XTTS v2.0 files!")
-    ModelManager._download_model_files([TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+    ModelManager._download_model_files(
+        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+    )
 
 
 # Training sentences generations
@@ -92,7 +94,7 @@ def main():
         gpt_num_audio_tokens=8194,
         gpt_start_audio_token=8192,
         gpt_stop_audio_token=8193,
-        use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint
+        use_ne_hifigan=True,  # if it is true it will keep the non-enhanced keys on the output checkpoint
         gpt_use_masking_gt_prompt_approach=True,
         gpt_use_perceiver_resampler=True,
     )
@@ -22,4 +22,8 @@ def test_synthesize():
     )
 
     # test pipe_out command
-    run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')
+<<<<<<< HEAD
+    run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')
+=======
+    run_cli('tts --text "test." --pipe_out ' f'--out_path "{output_path}" | aplay')
+>>>>>>> Make style