mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev' of https://github.com/mozilla/TTS into dev
This commit is contained in:
commit
975842f71a
18
config.json
18
config.json
|
@ -12,6 +12,8 @@
|
|||
"win_length": 1024, // stft window length in ms.
|
||||
"hop_length": 256, // stft window hop-length in ms.
|
||||
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
||||
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
||||
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
|
@ -27,6 +29,18 @@
|
|||
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
||||
},
|
||||
|
||||
// VOCABULARY PARAMETERS
|
||||
// if custom character set is not defined,
|
||||
// default set in symbols.py is used
|
||||
"characters":{
|
||||
"pad": "_",
|
||||
"eos": "~",
|
||||
"bos": "^",
|
||||
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
|
||||
"punctuations":"!'(),-.:;? ",
|
||||
"phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
|
||||
},
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
|
@ -93,7 +107,7 @@
|
|||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/",
|
||||
"output_path": "/data4/rw/home/Trainings/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
|
@ -110,7 +124,7 @@
|
|||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "/home/erogol/Data/LJSpeech-1.1/",
|
||||
"path": "/root/LJSpeech-1.1/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": null
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@ class MyDataset(Dataset):
|
|||
text_cleaner,
|
||||
ap,
|
||||
meta_data,
|
||||
tp=None,
|
||||
batch_group_size=0,
|
||||
min_seq_len=0,
|
||||
max_seq_len=float("inf"),
|
||||
|
@ -49,6 +50,7 @@ class MyDataset(Dataset):
|
|||
self.min_seq_len = min_seq_len
|
||||
self.max_seq_len = max_seq_len
|
||||
self.ap = ap
|
||||
self.tp = tp
|
||||
self.use_phonemes = use_phonemes
|
||||
self.phoneme_cache_path = phoneme_cache_path
|
||||
self.phoneme_language = phoneme_language
|
||||
|
@ -75,13 +77,13 @@ class MyDataset(Dataset):
|
|||
|
||||
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
|
||||
"""generate a phoneme sequence from text.
|
||||
|
||||
since the usage is for subsequent caching, we never add bos and
|
||||
eos chars here. Instead we add those dynamically later; based on the
|
||||
config option."""
|
||||
phonemes = phoneme_to_sequence(text, [self.cleaners],
|
||||
language=self.phoneme_language,
|
||||
enable_eos_bos=False)
|
||||
enable_eos_bos=False,
|
||||
tp=self.tp)
|
||||
phonemes = np.asarray(phonemes, dtype=np.int32)
|
||||
np.save(cache_path, phonemes)
|
||||
return phonemes
|
||||
|
@ -101,7 +103,7 @@ class MyDataset(Dataset):
|
|||
phonemes = self._generate_and_cache_phoneme_sequence(text,
|
||||
cache_path)
|
||||
if self.enable_eos_bos:
|
||||
phonemes = pad_with_eos_bos(phonemes)
|
||||
phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
|
||||
phonemes = np.asarray(phonemes, dtype=np.int32)
|
||||
return phonemes
|
||||
|
||||
|
@ -113,7 +115,7 @@ class MyDataset(Dataset):
|
|||
text = self._load_or_generate_phoneme_sequence(wav_file, text)
|
||||
else:
|
||||
text = np.asarray(
|
||||
text_to_sequence(text, [self.cleaners]), dtype=np.int32)
|
||||
text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32)
|
||||
|
||||
assert text.size > 0, self.items[idx][1]
|
||||
assert wav.size > 0, self.items[idx][1]
|
||||
|
@ -193,7 +195,7 @@ class MyDataset(Dataset):
|
|||
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
|
||||
linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
|
||||
|
||||
mel_lengths = [m.shape[1] for m in mel]
|
||||
mel_lengths = [m.shape[1] for m in mel]
|
||||
|
||||
# compute 'stop token' targets
|
||||
stop_targets = [
|
||||
|
|
|
@ -135,7 +135,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# LOAD TTS MODEL\n",
|
||||
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"# multi speaker \n",
|
||||
"if CONFIG.use_speaker_embedding:\n",
|
||||
|
@ -145,6 +145,10 @@
|
|||
" speakers = []\n",
|
||||
" speaker_id = None\n",
|
||||
"\n",
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in CONFIG.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**CONFIG.characters)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
|
||||
|
|
|
@ -65,7 +65,7 @@
|
|||
"from TTS.utils.text import text_to_sequence\n",
|
||||
"from TTS.utils.synthesis import synthesis\n",
|
||||
"from TTS.utils.visual import visualize\n",
|
||||
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
|
@ -149,6 +149,10 @@
|
|||
" speakers = []\n",
|
||||
" speaker_id = None\n",
|
||||
"\n",
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in CONFIG.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**CONFIG.characters)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n",
|
||||
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
|
@ -94,6 +94,10 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in C.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**C.characters)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
||||
"# TODO: multiple speaker\n",
|
||||
|
@ -116,7 +120,7 @@
|
|||
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
|
||||
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
||||
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
|
||||
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
||||
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
||||
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -100,7 +100,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# LOAD TTS MODEL\n",
|
||||
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"# multi speaker \n",
|
||||
"if CONFIG.use_speaker_embedding:\n",
|
||||
|
@ -110,6 +110,10 @@
|
|||
" speakers = []\n",
|
||||
" speaker_id = None\n",
|
||||
"\n",
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in CONFIG.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**CONFIG.characters)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
|
||||
|
|
|
@ -9,8 +9,11 @@ import yaml
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import load_config, setup_model
|
||||
from TTS.utils.speakers import load_speaker_mapping
|
||||
# pylint: disable=unused-wildcard-import
|
||||
# pylint: disable=wildcard-import
|
||||
from TTS.utils.synthesis import *
|
||||
from TTS.utils.text import phonemes, symbols
|
||||
|
||||
from TTS.utils.text import make_symbols, phonemes, symbols
|
||||
|
||||
alphabets = r"([A-Za-z])"
|
||||
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
|
||||
|
@ -38,12 +41,20 @@ class Synthesizer(object):
|
|||
self.config.pwgan_config, self.config.use_cuda)
|
||||
|
||||
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
|
||||
# pylint: disable=global-statement
|
||||
global symbols, phonemes
|
||||
|
||||
print(" > Loading TTS model ...")
|
||||
print(" | > model config: ", tts_config)
|
||||
print(" | > checkpoint file: ", tts_checkpoint)
|
||||
|
||||
self.tts_config = load_config(tts_config)
|
||||
self.use_phonemes = self.tts_config.use_phonemes
|
||||
self.ap = AudioProcessor(**self.tts_config.audio)
|
||||
|
||||
if 'characters' in self.tts_config.keys():
|
||||
symbols, phonemes = make_symbols(**self.tts_config.characters)
|
||||
|
||||
if self.use_phonemes:
|
||||
self.input_size = len(phonemes)
|
||||
else:
|
||||
|
@ -54,7 +65,7 @@ class Synthesizer(object):
|
|||
num_speakers = len(self.tts_speakers)
|
||||
else:
|
||||
num_speakers = 0
|
||||
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
|
||||
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
|
||||
# load model state
|
||||
cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
|
||||
# load the model
|
||||
|
@ -84,7 +95,7 @@ class Synthesizer(object):
|
|||
mulaw=self.wavernn_config.mulaw,
|
||||
pad=self.wavernn_config.pad,
|
||||
use_aux_net=self.wavernn_config.use_aux_net,
|
||||
use_upsample_net = self.wavernn_config.use_upsample_net,
|
||||
use_upsample_net=self.wavernn_config.use_upsample_net,
|
||||
upsample_factors=self.wavernn_config.upsample_factors,
|
||||
feat_dims=80,
|
||||
compute_dims=128,
|
||||
|
|
|
@ -8,7 +8,7 @@ import string
|
|||
|
||||
from TTS.utils.synthesis import synthesis
|
||||
from TTS.utils.generic_utils import load_config, setup_model
|
||||
from TTS.utils.text.symbols import symbols, phonemes
|
||||
from TTS.utils.text.symbols import make_symbols, symbols, phonemes
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
|
@ -48,6 +48,8 @@ def tts(model,
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
global symbols, phonemes
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('text', type=str, help='Text to generate speech.')
|
||||
parser.add_argument('config_path',
|
||||
|
@ -105,6 +107,10 @@ if __name__ == "__main__":
|
|||
# load the audio processor
|
||||
ap = AudioProcessor(**C.audio)
|
||||
|
||||
# if the vocabulary was passed, replace the default
|
||||
if 'characters' in C.keys():
|
||||
symbols, phonemes = make_symbols(**C.characters)
|
||||
|
||||
# load speakers
|
||||
if args.speakers_json != '':
|
||||
speakers = json.load(open(args.speakers_json, 'r'))
|
||||
|
|
|
@ -19,6 +19,16 @@
|
|||
"mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": false
|
||||
},
|
||||
|
||||
"characters":{
|
||||
"pad": "_",
|
||||
"eos": "~",
|
||||
"bos": "^",
|
||||
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
|
||||
"punctuations":"!'(),-.:;? ",
|
||||
"phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
|
||||
},
|
||||
|
||||
"hidden_size": 128,
|
||||
"embedding_size": 256,
|
||||
"text_cleaner": "english_cleaners",
|
||||
|
|
|
@ -5,13 +5,19 @@ import torch as T
|
|||
|
||||
from TTS.server.synthesizer import Synthesizer
|
||||
from TTS.tests import get_tests_input_path, get_tests_output_path
|
||||
from TTS.utils.text.symbols import phonemes, symbols
|
||||
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model
|
||||
|
||||
|
||||
class DemoServerTest(unittest.TestCase):
|
||||
# pylint: disable=R0201
|
||||
def _create_random_model(self):
|
||||
# pylint: disable=global-statement
|
||||
global symbols, phonemes
|
||||
config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
|
||||
if 'characters' in config.keys():
|
||||
symbols, phonemes = make_symbols(**config.characters)
|
||||
|
||||
num_chars = len(phonemes) if config.use_phonemes else len(symbols)
|
||||
model = setup_model(num_chars, 0, config)
|
||||
output_path = os.path.join(get_tests_output_path())
|
||||
|
|
|
@ -37,7 +37,8 @@ class TestTTSDataset(unittest.TestCase):
|
|||
r,
|
||||
c.text_cleaner,
|
||||
ap=self.ap,
|
||||
meta_data=items,
|
||||
meta_data=items,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
batch_group_size=bgs,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=float("inf"),
|
||||
|
|
|
@ -1,7 +1,14 @@
|
|||
import os
|
||||
# pylint: disable=unused-wildcard-import
|
||||
# pylint: disable=wildcard-import
|
||||
# pylint: disable=unused-import
|
||||
import unittest
|
||||
import torch as T
|
||||
|
||||
from TTS.utils.text import *
|
||||
from TTS.tests import get_tests_path
|
||||
from TTS.utils.generic_utils import load_config
|
||||
|
||||
TESTS_PATH = get_tests_path()
|
||||
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
|
||||
|
||||
def test_phoneme_to_sequence():
|
||||
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
|
||||
|
@ -9,67 +16,80 @@ def test_phoneme_to_sequence():
|
|||
lang = "en-us"
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# multiple punctuations
|
||||
text = "Be a voice, not an! echo?"
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# not ending with punctuation
|
||||
text = "Be a voice, not an! echo"
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# original
|
||||
text = "Be a voice, not an echo!"
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# extra space after the sentence
|
||||
text = "Be a voice, not an! echo. "
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# extra space after the sentence
|
||||
text = "Be a voice, not an! echo. "
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
# padding char
|
||||
text = "_Be a _voice, not an! echo_"
|
||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||
text_hat = sequence_to_phoneme(sequence)
|
||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
||||
print(text_hat)
|
||||
print(len(sequence))
|
||||
assert text_hat == gt
|
||||
|
||||
assert text_hat == text_hat_with_params == gt
|
||||
|
||||
def test_text2phone():
    """Check that `text2phone` produces the expected '|'-separated phoneme string."""
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
    gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
    lang = "en-us"
    ph = text2phone(text, lang)
    # On failure, show the phonemes that were actually computed (`ph`), not the
    # module-level `phonemes` symbol list pulled in by the wildcard import.
    assert gt == ph, f"\n{ph} \n vs \n{gt}"
|
8
train.py
8
train.py
|
@ -25,7 +25,7 @@ from TTS.utils.logger import Logger
|
|||
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
|
||||
get_speakers
|
||||
from TTS.utils.synthesis import synthesis
|
||||
from TTS.utils.text.symbols import phonemes, symbols
|
||||
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.datasets.preprocess import load_meta_data
|
||||
from TTS.utils.radam import RAdam
|
||||
|
@ -49,6 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
|
|||
c.text_cleaner,
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
|
@ -504,9 +505,12 @@ def evaluate(model, criterion, ap, global_step, epoch):
|
|||
|
||||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
global meta_data_train, meta_data_eval
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
if 'characters' in c.keys():
|
||||
symbols, phonemes = make_symbols(**c.characters)
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
|
|
|
@ -427,6 +427,15 @@ def check_config(c):
|
|||
_check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
|
||||
_check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
|
||||
|
||||
# vocabulary parameters
|
||||
_check_argument('characters', c, restricted=False, val_type=dict)
|
||||
_check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
_check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
_check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
_check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
_check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
_check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
|
||||
|
||||
# normalization parameters
|
||||
_check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
|
||||
_check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
|
||||
|
|
|
@ -9,10 +9,11 @@ def text_to_seqvec(text, CONFIG, use_cuda):
|
|||
if CONFIG.use_phonemes:
|
||||
seq = np.asarray(
|
||||
phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
|
||||
CONFIG.enable_eos_bos_chars),
|
||||
CONFIG.enable_eos_bos_chars,
|
||||
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
|
||||
dtype=np.int32)
|
||||
else:
|
||||
seq = np.asarray(text_to_sequence(text, text_cleaner), dtype=np.int32)
|
||||
seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
|
||||
# torch tensor
|
||||
chars_var = torch.from_numpy(seq).unsqueeze(0)
|
||||
if use_cuda:
|
||||
|
|
|
@ -5,15 +5,15 @@ from packaging import version
|
|||
import phonemizer
|
||||
from phonemizer.phonemize import phonemize
|
||||
from TTS.utils.text import cleaners
|
||||
from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \
|
||||
from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
|
||||
_eos
|
||||
|
||||
# Mappings from symbol to numeric ID and vice versa:
|
||||
_SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)}
|
||||
_ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)}
|
||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
||||
|
||||
_PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)}
|
||||
_ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)}
|
||||
_phonemes_to_id = {s: i for i, s in enumerate(phonemes)}
|
||||
_id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
|
||||
|
||||
# Regular expression matching text enclosed in curly braces:
|
||||
_CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
|
@ -57,11 +57,25 @@ def text2phone(text, language):
|
|||
return ph
|
||||
|
||||
|
||||
def pad_with_eos_bos(phoneme_sequence):
|
||||
return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]]
|
||||
def pad_with_eos_bos(phoneme_sequence, tp=None):
|
||||
# pylint: disable=global-statement
|
||||
global _phonemes_to_id, _bos, _eos
|
||||
if tp:
|
||||
_bos = tp['bos']
|
||||
_eos = tp['eos']
|
||||
_, _phonemes = make_symbols(**tp)
|
||||
_phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
|
||||
|
||||
return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
|
||||
|
||||
|
||||
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
|
||||
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
|
||||
# pylint: disable=global-statement
|
||||
global _phonemes_to_id
|
||||
if tp:
|
||||
_, _phonemes = make_symbols(**tp)
|
||||
_phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
|
||||
|
||||
sequence = []
|
||||
text = text.replace(":", "")
|
||||
clean_text = _clean_text(text, cleaner_names)
|
||||
|
@ -73,21 +87,27 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
|
|||
sequence += _phoneme_to_sequence(phoneme)
|
||||
# Append EOS char
|
||||
if enable_eos_bos:
|
||||
sequence = pad_with_eos_bos(sequence)
|
||||
sequence = pad_with_eos_bos(sequence, tp=tp)
|
||||
return sequence
|
||||
|
||||
|
||||
def sequence_to_phoneme(sequence, tp=None):
    '''Converts a sequence of phoneme IDs back to a string.

    Args:
        sequence: iterable of phoneme IDs. IDs that are not present in the
            active ID-to-phoneme table are skipped.
        tp: optional dict of keyword arguments for `make_symbols`. When given,
            the phoneme table is rebuilt from it before decoding.

    Returns:
        The decoded string, with every '}{' replaced by a single space.
    '''
    # pylint: disable=global-statement
    global _id_to_phonemes
    if tp:
        # NOTE: this rebinds the module-level table, so the custom phoneme set
        # stays active for later calls that do not pass `tp`.
        _, _phonemes = make_symbols(**tp)
        _id_to_phonemes = dict(enumerate(_phonemes))

    # Join once instead of growing the string with `+=` inside the loop.
    result = ''.join(_id_to_phonemes[symbol_id]
                     for symbol_id in sequence
                     if symbol_id in _id_to_phonemes)
    return result.replace('}{', ' ')
|
||||
|
||||
|
||||
def text_to_sequence(text, cleaner_names):
|
||||
def text_to_sequence(text, cleaner_names, tp=None):
|
||||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
|
||||
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
|
||||
|
@ -100,6 +120,12 @@ def text_to_sequence(text, cleaner_names):
|
|||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
'''
|
||||
# pylint: disable=global-statement
|
||||
global _symbol_to_id
|
||||
if tp:
|
||||
_symbols, _ = make_symbols(**tp)
|
||||
_symbol_to_id = {s: i for i, s in enumerate(_symbols)}
|
||||
|
||||
sequence = []
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while text:
|
||||
|
@ -114,12 +140,18 @@ def text_to_sequence(text, cleaner_names):
|
|||
return sequence
|
||||
|
||||
|
||||
def sequence_to_text(sequence):
|
||||
def sequence_to_text(sequence, tp=None):
|
||||
'''Converts a sequence of IDs back to a string'''
|
||||
# pylint: disable=global-statement
|
||||
global _id_to_symbol
|
||||
if tp:
|
||||
_symbols, _ = make_symbols(**tp)
|
||||
_id_to_symbol = {i: s for i, s in enumerate(_symbols)}
|
||||
|
||||
result = ''
|
||||
for symbol_id in sequence:
|
||||
if symbol_id in _ID_TO_SYMBOL:
|
||||
s = _ID_TO_SYMBOL[symbol_id]
|
||||
if symbol_id in _id_to_symbol:
|
||||
s = _id_to_symbol[symbol_id]
|
||||
# Enclose ARPAbet back in curly braces:
|
||||
if len(s) > 1 and s[0] == '@':
|
||||
s = '{%s}' % s[1:]
|
||||
|
@ -137,11 +169,11 @@ def _clean_text(text, cleaner_names):
|
|||
|
||||
|
||||
def _symbols_to_sequence(syms):
|
||||
return [_SYMBOL_TO_ID[s] for s in syms if _should_keep_symbol(s)]
|
||||
return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)]
|
||||
|
||||
|
||||
def _phoneme_to_sequence(phons):
|
||||
return [_PHONEMES_TO_ID[s] for s in list(phons) if _should_keep_phoneme(s)]
|
||||
return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)]
|
||||
|
||||
|
||||
def _arpabet_to_sequence(text):
|
||||
|
@ -149,8 +181,8 @@ def _arpabet_to_sequence(text):
|
|||
|
||||
|
||||
def _should_keep_symbol(s):
|
||||
return s in _SYMBOL_TO_ID and s not in ['~', '^', '_']
|
||||
return s in _symbol_to_id and s not in ['~', '^', '_']
|
||||
|
||||
|
||||
def _should_keep_phoneme(p):
|
||||
return p in _PHONEMES_TO_ID and p not in ['~', '^', '_']
|
||||
return p in _phonemes_to_id and p not in ['~', '^', '_']
|
||||
|
|
|
@ -5,6 +5,18 @@ Defines the set of symbols used in text input to the model.
|
|||
The default is a set of ASCII characters that works well for English or text that has been run
|
||||
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
||||
'''
|
||||
def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):  # pylint: disable=redefined-outer-name
    '''Create the character-level and phoneme-level symbol lists.

    Args:
        characters: iterable of characters (e.g. a str) used for character input.
        phonemes: iterable of phoneme characters (e.g. a str); it is sorted
            before use.
        punctuations: punctuation characters appended to the phoneme list.
        pad: padding symbol, placed first in both lists.
        eos: end-of-sequence symbol, placed second in both lists.
        bos: beginning-of-sequence symbol, placed third in both lists.

    Returns:
        A tuple `(symbols, phonemes)`:
        `symbols` is `[pad, eos, bos]` + characters + '@'-prefixed sorted phonemes;
        `phonemes` is `[pad, eos, bos]` + sorted phonemes + punctuations.
    '''
    _phonemes_sorted = sorted(phonemes)

    # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
    _arpabet = ['@' + s for s in _phonemes_sorted]

    # Export all symbols:
    _symbols = [pad, eos, bos] + list(characters) + _arpabet
    _phonemes = [pad, eos, bos] + _phonemes_sorted + list(punctuations)

    return _symbols, _phonemes
|
||||
|
||||
_pad = '_'
|
||||
_eos = '~'
|
||||
|
@ -20,14 +32,9 @@ _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðsz
|
|||
_suprasegmentals = 'ˈˌːˑ'
|
||||
_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ'
|
||||
_diacrilics = 'ɚ˞ɫ'
|
||||
_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
|
||||
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
|
||||
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
_arpabet = ['@' + s for s in _phonemes]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
|
||||
phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
|
||||
symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
|
||||
|
||||
# Generate ALIEN language
|
||||
# from random import shuffle
|
||||
|
|
|
@ -54,9 +54,10 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
|
|||
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
|
||||
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
|
||||
if CONFIG.use_phonemes:
|
||||
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars)
|
||||
text = sequence_to_phoneme(seq)
|
||||
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
|
||||
text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
|
||||
print(text)
|
||||
|
||||
plt.yticks(range(len(text)), list(text))
|
||||
plt.colorbar()
|
||||
|
||||
|
|
Loading…
Reference in New Issue