Merge branch 'dev-tacotron2' of github.com:mozilla/TTS into dev-tacotron2
commit 7ac66661d7 (mirror of https://github.com/coqui-ai/TTS.git)

.compute (11 changed lines)
@@ -1,7 +1,12 @@
 #!/bin/bash
-ls ${SHARED_DIR}/data/
+ls ${SHARED_DIR}/data/mozilla/Judy/
+yes | apt-get install sox
+yes | apt-get install ffmpeg
+soxi /data/ro/shared/data/mozilla/Judy/batch6/wavs_no_processing/6_126.wav
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak
 python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/Blizzard/Nancy/ --restore_path ${USER_DIR}/best_model_4467.pth.tar
-python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/Mozilla/
+# wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz
+# tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR}
+# python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ --restore_path ${USER_DIR}/best_model_4583.pth.tar
+python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/
@@ -1,6 +1,6 @@
 {
-    "run_name": "mozilla-fattn",
-    "run_description": "Finetune 4583, Mozilla with 0 batch group size and fattn",
+    "run_name": "mozilla-nomask-fattn-bn",
+    "run_description": "Finetune 4700 orignal -> bn prenet - Mozilla with prenet bn, no mask, forward attn, batch group size 0",

     "audio":{
         // Audio processing parameters

@@ -40,13 +40,13 @@
     "windowing": false,    // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,    // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "softmax",    // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original",    // ONLY TACOTRON2 - "original" or "bn".
+    "prenet_type": "bn",    // ONLY TACOTRON2 - "original" or "bn".
     "use_forward_attn": true,    // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
     "transition_agent": false,    // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
     "loss_masking": false,    // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false,    // enable/disable beginning of sentence and end of sentence chars.

-    "batch_size": 16,    // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 24,    // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,
     "r": 1,    // Number of frames to predict for step.
     "wd": 0.000001,    // Weight decay weight.

@@ -57,7 +57,7 @@
     "batch_group_size": 0,    // Number of batches to shuffle after bucketing.

     "run_eval": false,
-    "test_delay_epochs": 10,    // Until attention is aligned, testing only wastes computation time.
+    "test_delay_epochs": 5,    // Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1",    // DATASET-RELATED: can be overwritten from command argument
     "meta_file_train": "prompts_train.data",    // DATASET-RELATED: metafile for training dataloader.
     "meta_file_val": "prompts_val.data",    // DATASET-RELATED: metafile for evaluation dataloader.

@@ -67,8 +67,8 @@
     "output_path": "../keep/",    // DATASET-RELATED: output path for all training outputs.
     "num_loader_workers": 8,    // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
-    "phoneme_cache_path": "nancy_us_phonemes",    // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "mozilla_us_phonemes",    // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true,    // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us",    // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners"
 }
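Note: `"prenet_type": "bn"` switches the Tacotron2 prenet from the original dropout variant to a batch-normalized one, which tends to keep prenet activations stable once dropout no longer fires at inference. A minimal sketch of the idea, assuming the prenet consumes one (batch, features) frame per decoder step; the class and argument names below are illustrative, not the repo's exact module:

    import torch.nn as nn

    class PrenetBN(nn.Module):
        # sketch: Linear -> BatchNorm -> ReLU blocks in place of the
        # "original" Linear -> ReLU -> Dropout blocks
        def __init__(self, in_dim, sizes=(256, 256)):
            super().__init__()
            layers = []
            for out_dim in sizes:
                layers += [nn.Linear(in_dim, out_dim, bias=False),
                           nn.BatchNorm1d(out_dim),
                           nn.ReLU()]
                in_dim = out_dim
            self.net = nn.Sequential(*layers)

        def forward(self, x):  # x: (batch, in_dim)
            return self.net(x)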
@@ -49,17 +49,20 @@ def mozilla(root_path, meta_files):
     items = []
     for idx, meta_file in enumerate(meta_files):
         folder = folders[idx]
-        txt_file = os.path.join(root_path, meta_file)
+        # txt_file = os.path.join(root_path, meta_file)
+        txt_file = meta_file
         with open(txt_file, 'r') as ttf:
             for line in ttf:
                 cols = line.split('|')
-                wav_file = os.path.join(root_path, folder, 'wavs_no_processing',
-                                        cols[1].strip())
+                # wav_file = os.path.join(root_path, folder,
+                #                         'wavs_no_processing', cols[1].strip())
+                wav_file = os.path.join(folder, 'wavs_no_processing',
+                                        cols[1].strip())
                 if os.path.isfile(wav_file):
                     text = cols[0].strip()
                     items.append([text, wav_file])
                 else:
-                    print(" > Error: {}".format(cols))
+                    print(" > Error: {}".format(wav_file))
                     continue
     return items
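Note: with `root_path` no longer joined in, each `folder` entry (and each `meta_file`) must itself be a usable path. A sketch of one metadata line as this loader parses it; the sentence text is invented, while the paths follow the `soxi` check in `.compute`:

    import os

    line = "Some Judy sentence.|6_126.wav\n"
    folder = "/data/ro/shared/data/mozilla/Judy/batch6"
    cols = line.split('|')
    text = cols[0].strip()
    wav_file = os.path.join(folder, 'wavs_no_processing', cols[1].strip())
    # -> /data/ro/shared/data/mozilla/Judy/batch6/wavs_no_processing/6_126.wav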
@@ -152,7 +152,7 @@ class Attention(nn.Module):
         """
         B = inputs.shape[0]
         T = inputs.shape[1]
-        self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1]], dim=1).to(inputs.device)
+        self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
         self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)

     def get_attention(self, query, processed_inputs, attention_cat):

@@ -183,16 +183,16 @@ class Attention(nn.Module):
     def apply_forward_attention(self, inputs, alignment, processed_query):
         # forward attention
         prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device)
-        self.alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-7) * alignment
-        alpha_norm = self.alpha / self.alpha.sum(dim=1).unsqueeze(1)
+        alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-8) * alignment
+        self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1)
         # compute context
-        context = torch.bmm(alpha_norm.unsqueeze(1), inputs)
+        context = torch.bmm(self.alpha.unsqueeze(1), inputs)
         context = context.squeeze(1)
         # compute transition agent
         if self.trans_agent:
             ta_input = torch.cat([context, processed_query.squeeze(1)], dim=-1)
             self.u = torch.sigmoid(self.ta(ta_input))
-        return context, alpha_norm, alignment
+        return context, self.alpha, alignment

     def forward(self, attention_hidden_state, inputs, processed_inputs,
                 attention_cat, mask):
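Note: the second hunk makes the stored `self.alpha` the normalized distribution. Previously the unnormalized product was kept and fed into the next step's recursion, so its scale could drift across decoder steps; the epsilon (and the 1e-7 added at init) keeps weights from getting pinned at exact zero, since a zero entry can never recover through the multiplicative update. One step of the recursion as now implemented, condensed here into a pure function for illustration:

    import torch
    import torch.nn.functional as F

    def forward_attention_step(alpha_prev, u, alignment, eps=1e-8):
        # alpha_prev: (B, T) normalized weights from the previous step
        # u: (B, 1) transition probability; alignment: (B, T) current energies
        shifted = F.pad(alpha_prev[:, :-1], (1, 0))  # alpha_{t-1}(n-1)
        alpha = ((1 - u) * alpha_prev + u * shifted + eps) * alignment
        return alpha / alpha.sum(dim=1, keepdim=True)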
@@ -8,5 +8,6 @@
     "wavernn_config":"config.json",    // wavernn config file
     "is_wavernn_batched":true,
     "port": 5002,
-    "use_cuda": true
+    "use_cuda": true,
+    "debug": true
 }
@@ -27,4 +27,4 @@ def tts():


 if __name__ == '__main__':
-    app.run(debug=True, host='0.0.0.0', port=config.port)
+    app.run(debug=config.debug, host='0.0.0.0', port=config.port)
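Note: with `"debug"` read from the config, the Flask debugger can be toggled per deployment instead of being hard-coded. A minimal sketch of the flow, assuming `load_config` returns an attribute-style dict as its other uses in this commit suggest; the config path here is illustrative:

    from flask import Flask
    from utils.generic_utils import load_config

    app = Flask(__name__)
    config = load_config('server/conf.json')
    app.run(debug=config.debug, host='0.0.0.0', port=config.port)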
@@ -10,6 +10,14 @@ from utils.audio import AudioProcessor
 from utils.generic_utils import load_config, setup_model
 from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme

+import re
+alphabets = "([A-Za-z])"
+prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
+suffixes = "(Inc|Ltd|Jr|Sr|Co)"
+starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = "[.](com|net|org|io|gov)"
+

 class Synthesizer(object):
     def __init__(self, config):

@@ -48,6 +56,7 @@ class Synthesizer(object):
         if use_cuda:
             self.tts_model.cuda()
         self.tts_model.eval()
+        self.tts_model.decoder.max_decoder_steps = 3000

     def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
         sys.path.append(lib_path) # set this if TTS is not installed globally

@@ -83,14 +92,42 @@ class Synthesizer(object):
         wav = np.array(wav)
         self.ap.save_wav(wav, path)

+    def split_into_sentences(self, text):
+        text = " " + text + " "
+        text = text.replace("\n", " ")
+        text = re.sub(prefixes, "\\1<prd>", text)
+        text = re.sub(websites, "<prd>\\1", text)
+        if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+        text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
+        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
+        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
+        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
+        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
+        if "”" in text: text = text.replace(".”", "”.")
+        if "\"" in text: text = text.replace(".\"", "\".")
+        if "!" in text: text = text.replace("!\"", "\"!")
+        if "?" in text: text = text.replace("?\"", "\"?")
+        text = text.replace(".", ".<stop>")
+        text = text.replace("?", "?<stop>")
+        text = text.replace("!", "!<stop>")
+        text = text.replace("<prd>", ".")
+        sentences = text.split("<stop>")
+        sentences = sentences[:-1]
+        sentences = [s.strip() for s in sentences]
+        return sentences
+
     def tts(self, text):
         wavs = []
-        for sen in text.split('.'):
+        sens = self.split_into_sentences(text)
+        if len(sens) == 0:
+            sens = [text + '.']
+        for sen in sens:
             if len(sen) < 3:
                 continue
             sen = sen.strip()
             print(sen)
-            sen = sen.strip()

             seq = np.array(self.input_adapter(sen))
             text_hat = sequence_to_phoneme(seq)
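Note: the splitter shields abbreviations, acronyms, and domain suffixes with `<prd>` before cutting on `<stop>` markers, so "Dr." or "example.com" no longer yield tiny fragments for the `len(sen) < 3` guard to drop. Because `sentences[:-1]` discards the tail after the last terminator, the `sens = [text + '.']` fallback covers input with no terminal punctuation at all. A quick illustration with an invented input, assuming `synth` is a loaded Synthesizer:

    sens = synth.split_into_sentences(
        "Dr. Smith visited example.com. Amazing! Is it real?")
    # -> ['Dr. Smith visited example.com.', 'Amazing!', 'Is it real?']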
train.py (10 changed lines)

@@ -499,6 +499,12 @@ if __name__ == '__main__':
         type=str,
         help='path for training outputs.',
         default='')
+    parser.add_argument(
+        '--output_folder',
+        type=str,
+        default='',
+        help='folder name for training outputs.'
+    )

     # DISTRUBUTED
     parser.add_argument(

@@ -524,8 +530,10 @@ if __name__ == '__main__':
     else:
         OUT_PATH = args.output_path

-    if args.group_id == '':
+    if args.group_id == '' and args.output_folder == '':
         OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
+    else:
+        OUT_PATH = os.path.join(OUT_PATH, args.output_folder)

     AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
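Note: `--output_folder` pins training outputs to a fixed directory instead of the per-run folder that `create_experiment_folder` derives from `run_name`. For example, `python3 train.py --config_path config_cluster.json --output_folder judy-bn` (folder name hypothetical) would write under `../keep/judy-bn` given this config's `output_path`, which makes resuming and cluster bookkeeping more predictable.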
@@ -50,6 +50,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
         sequence = [_phonemes_to_id['^']]
     else:
         sequence = []
+    text = text.replace(":", "")
     clean_text = _clean_text(text, cleaner_names)
     phonemes = text2phone(clean_text, language)
     if phonemes is None:
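Note: stripping ":" before cleaning looks like a guard against characters that survive phonemization but have no entry in the symbol table, since an unknown symbol would fail the id lookup. Illustrative only, reusing the `_phonemes_to_id` name visible in the hunk above; the exact failure path is an assumption:

    # assumption: phonemizer output containing ':' reaches the id lookup
    _phonemes_to_id = {p: i for i, p in enumerate(phonemes)}
    sequence = [_phonemes_to_id[p] for p in "ɛ:"]  # KeyError on ':' if absent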
@@ -28,6 +28,10 @@ _arpabet = ['@' + s for s in _phonemes]
 symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
 phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)

+# Generate ALIEN language
+# from random import shuffle
+# shuffle(phonemes)
+
 if __name__ == '__main__':
     print(" > TTS symbols ")
     print(symbols)