Merge branch 'dev' of https://github.com/mozilla/TTS into dev

commit 975842f71a
Author: erogol
Date: 2020-03-10 11:32:26 +01:00

18 changed files with 202 additions and 62 deletions

View File

@@ -12,6 +12,8 @@
    "win_length": 1024, // stft window length in ms.
    "hop_length": 256, // stft window hop-lengh in ms.
    "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+   "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
+   "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
    "min_level_db": -100, // normalization range
    "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
    "power": 1.5, // value to sharpen wav signals after GL algorithm.
@@ -27,6 +29,18 @@
    "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
    },
+   // VOCABULARY PARAMETERS
+   // if custom character set is not defined,
+   // default set in symbols.py is used
+   "characters":{
+       "pad": "_",
+       "eos": "~",
+       "bos": "^",
+       "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+       "punctuations":"!'(),-.:;? ",
+       "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+   },
    // DISTRIBUTED TRAINING
    "distributed":{
        "backend": "nccl",
@@ -93,7 +107,7 @@
    "max_seq_len": 153, // DATASET-RELATED: maximum text length
    // PATHS
-   "output_path": "/home/erogol/Models/",
+   "output_path": "/data4/rw/home/Trainings/",
    // PHONEMES
    "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
@@ -110,7 +124,7 @@
    [
        {
            "name": "ljspeech",
-           "path": "/home/erogol/Data/LJSpeech-1.1/",
+           "path": "/root/LJSpeech-1.1/",
            "meta_file_train": "metadata.csv",
            "meta_file_val": null
        }
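
The `characters` block added above is the vocabulary that the rest of this commit consumes: wherever a config is loaded, the default tables from `symbols.py` are replaced by the output of `make_symbols`. A minimal sketch of that pattern, mirroring the notebook and `synthesize.py` changes further down (the config path here is a placeholder):

```python
from TTS.utils.generic_utils import load_config
from TTS.utils.text.symbols import make_symbols, symbols, phonemes

C = load_config("config.json")  # placeholder path to a training config

# If the optional vocabulary block is present, rebuild the symbol tables from it.
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

# Model input size then follows the (possibly custom) vocabulary.
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
```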

View File

@@ -15,6 +15,7 @@ class MyDataset(Dataset):
                 text_cleaner,
                 ap,
                 meta_data,
+                tp=None,
                 batch_group_size=0,
                 min_seq_len=0,
                 max_seq_len=float("inf"),
@@ -49,6 +50,7 @@ class MyDataset(Dataset):
        self.min_seq_len = min_seq_len
        self.max_seq_len = max_seq_len
        self.ap = ap
+       self.tp = tp
        self.use_phonemes = use_phonemes
        self.phoneme_cache_path = phoneme_cache_path
        self.phoneme_language = phoneme_language
@@ -75,13 +77,13 @@ class MyDataset(Dataset):
    def _generate_and_cache_phoneme_sequence(self, text, cache_path):
        """generate a phoneme sequence from text.
        since the usage is for subsequent caching, we never add bos and
        eos chars here. Instead we add those dynamically later; based on the
        config option."""
        phonemes = phoneme_to_sequence(text, [self.cleaners],
                                       language=self.phoneme_language,
-                                      enable_eos_bos=False)
+                                      enable_eos_bos=False,
+                                      tp=self.tp)
        phonemes = np.asarray(phonemes, dtype=np.int32)
        np.save(cache_path, phonemes)
        return phonemes
@@ -101,7 +103,7 @@ class MyDataset(Dataset):
            phonemes = self._generate_and_cache_phoneme_sequence(text,
                                                                 cache_path)
        if self.enable_eos_bos:
-           phonemes = pad_with_eos_bos(phonemes)
+           phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
        phonemes = np.asarray(phonemes, dtype=np.int32)
        return phonemes
@@ -113,7 +115,7 @@ class MyDataset(Dataset):
            text = self._load_or_generate_phoneme_sequence(wav_file, text)
        else:
            text = np.asarray(
-               text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+               text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32)
        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]
@@ -193,7 +195,7 @@ class MyDataset(Dataset):
        mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
        linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
        mel_lengths = [m.shape[1] for m in mel]
        # compute 'stop token' targets
        stop_targets = [
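
The new `tp` argument is how a custom vocabulary reaches the dataset: callers hand it the config's `characters` dict, or `None` to keep the defaults. A hedged sketch of the call site, following the pattern used in `train.py` and the test/notebook updates below; the module path and the `c`, `ap`, `meta_data`, `r` names are assumed to come from the usual config and preprocessing setup:

```python
from TTS.datasets.TTSDataset import MyDataset

# Assumes c = load_config(...), ap = AudioProcessor(**c.audio), meta_data from the
# dataset preprocessor, and r = the model's reduction factor (e.g. checkpoint['r']).
dataset = MyDataset(
    r,
    c.text_cleaner,
    ap=ap,
    meta_data=meta_data,
    tp=c.characters if 'characters' in c.keys() else None,  # custom vocabulary, if any
    use_phonemes=c.use_phonemes,
    phoneme_cache_path=c.phoneme_cache_path,
    enable_eos_bos=c.enable_eos_bos_chars)
```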

View File

@@ -135,7 +135,7 @@
   "outputs": [],
   "source": [
    "# LOAD TTS MODEL\n",
-   "from TTS.utils.text.symbols import symbols, phonemes\n",
+   "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
    "\n",
    "# multi speaker \n",
    "if CONFIG.use_speaker_embedding:\n",
@@ -145,6 +145,10 @@
    " speakers = []\n",
    " speaker_id = None\n",
    "\n",
+   "# if the vocabulary was passed, replace the default\n",
+   "if 'characters' in CONFIG.keys():\n",
+   " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
+   "\n",
    "# load the model\n",
    "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
    "model = setup_model(num_chars, len(speakers), CONFIG)\n",

View File

@@ -65,7 +65,7 @@
    "from TTS.utils.text import text_to_sequence\n",
    "from TTS.utils.synthesis import synthesis\n",
    "from TTS.utils.visual import visualize\n",
-   "from TTS.utils.text.symbols import symbols, phonemes\n",
+   "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
    "\n",
    "import IPython\n",
    "from IPython.display import Audio\n",
@@ -149,6 +149,10 @@
    " speakers = []\n",
    " speaker_id = None\n",
    "\n",
+   "# if the vocabulary was passed, replace the default\n",
+   "if 'characters' in CONFIG.keys():\n",
+   " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
+   "\n",
    "# load the model\n",
    "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
    "model = setup_model(num_chars, len(speakers), CONFIG)\n",

View File

@@ -37,7 +37,7 @@
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.utils.visual import plot_spectrogram\n",
    "from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n",
-   "from TTS.utils.text.symbols import symbols, phonemes\n",
+   "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
@@ -94,6 +94,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
+   "# if the vocabulary was passed, replace the default\n",
+   "if 'characters' in C.keys():\n",
+   " symbols, phonemes = make_symbols(**C.characters)\n",
+   "\n",
    "# load the model\n",
    "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
    "# TODO: multiple speaker\n",
@@ -116,7 +120,7 @@
    "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
    "preprocessor = getattr(preprocessor, DATASET.lower())\n",
    "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
-   "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
+   "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
    "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
   ]
  },

View File

@@ -100,7 +100,7 @@
   "outputs": [],
   "source": [
    "# LOAD TTS MODEL\n",
-   "from TTS.utils.text.symbols import symbols, phonemes\n",
+   "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
    "\n",
    "# multi speaker \n",
    "if CONFIG.use_speaker_embedding:\n",
@@ -110,6 +110,10 @@
    " speakers = []\n",
    " speaker_id = None\n",
    "\n",
+   "# if the vocabulary was passed, replace the default\n",
+   "if 'characters' in CONFIG.keys():\n",
+   " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
+   "\n",
    "# load the model\n",
    "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
    "model = setup_model(num_chars, len(speakers), CONFIG)\n",

View File

@@ -9,8 +9,11 @@ import yaml
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config, setup_model
from TTS.utils.speakers import load_speaker_mapping
+# pylint: disable=unused-wildcard-import
+# pylint: disable=wildcard-import
from TTS.utils.synthesis import *
-from TTS.utils.text import phonemes, symbols
+from TTS.utils.text import make_symbols, phonemes, symbols

alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
@@ -38,12 +41,20 @@ class Synthesizer(object):
                           self.config.pwgan_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
+       # pylint: disable=global-statement
+       global symbols, phonemes
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
+       if 'characters' in self.tts_config.keys():
+           symbols, phonemes = make_symbols(**self.tts_config.characters)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
@@ -54,7 +65,7 @@ class Synthesizer(object):
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
@@ -84,7 +95,7 @@ class Synthesizer(object):
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
-           use_upsample_net = self.wavernn_config.use_upsample_net,
+           use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
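
A note on the `global symbols, phonemes` declaration added to `load_tts` above: both names are module-level imports that are later used to compute `input_size`, so without `global` the `make_symbols` assignment would only bind locals and the custom vocabulary would be silently ignored. A standalone illustration of that pattern (the names below are illustrative, not the server code):

```python
DEFAULT_TABLE = ["a", "b", "c"]
vocab = DEFAULT_TABLE            # module-level table, like `symbols` / `phonemes`

def load(custom=None):
    global vocab                 # without this, `vocab = ...` would bind a local name
    if custom is not None:
        vocab = list(custom)
    return len(vocab)            # analogous to computing the model's input size

print(load())        # 3 -> default table
print(load("xyzw"))  # 4 -> table rebound from the custom character set
```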

View File

@@ -8,7 +8,7 @@ import string

from TTS.utils.synthesis import synthesis
from TTS.utils.generic_utils import load_config, setup_model
-from TTS.utils.text.symbols import symbols, phonemes
+from TTS.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.utils.audio import AudioProcessor
@@ -48,6 +48,8 @@ def tts(model,

if __name__ == "__main__":
+   global symbols, phonemes
    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    parser.add_argument('config_path',
@@ -105,6 +107,10 @@ if __name__ == "__main__":
    # load the audio processor
    ap = AudioProcessor(**C.audio)
+   # if the vocabulary was passed, replace the default
+   if 'characters' in C.keys():
+       symbols, phonemes = make_symbols(**C.characters)
    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))

View File

@@ -19,6 +19,16 @@
    "mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!!
    "do_trim_silence": false
    },
+   "characters":{
+       "pad": "_",
+       "eos": "~",
+       "bos": "^",
+       "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+       "punctuations":"!'(),-.:;? ",
+       "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+   },
    "hidden_size": 128,
    "embedding_size": 256,
    "text_cleaner": "english_cleaners",

View File

@@ -5,13 +5,19 @@ import torch as T

from TTS.server.synthesizer import Synthesizer
from TTS.tests import get_tests_input_path, get_tests_output_path
-from TTS.utils.text.symbols import phonemes, symbols
+from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model

class DemoServerTest(unittest.TestCase):
+   # pylint: disable=R0201
    def _create_random_model(self):
+       # pylint: disable=global-statement
+       global symbols, phonemes
        config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
+       if 'characters' in config.keys():
+           symbols, phonemes = make_symbols(**config.characters)
        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = os.path.join(get_tests_output_path())

View File

@@ -37,7 +37,8 @@ class TestTTSDataset(unittest.TestCase):
            r,
            c.text_cleaner,
            ap=self.ap,
            meta_data=items,
+           tp=c.characters if 'characters' in c.keys() else None,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),

View File

@@ -1,7 +1,14 @@
+import os
+# pylint: disable=unused-wildcard-import
+# pylint: disable=wildcard-import
+# pylint: disable=unused-import
import unittest
+import torch as T
from TTS.utils.text import *
+from TTS.tests import get_tests_path
+from TTS.utils.generic_utils import load_config
+
+TESTS_PATH = get_tests_path()
+conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))

def test_phoneme_to_sequence():
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
@@ -9,67 +16,80 @@ def test_phoneme_to_sequence():
    lang = "en-us"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "ɹiːsənt ɹɪːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # multiple punctuations
    text = "Be a voice, not an! echo?"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # not ending with punctuation
    text = "Be a voice, not an! echo"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # original
    text = "Be a voice, not an echo!"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # extra space after the sentence
    text = "Be a voice, not an! echo. "
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # extra space after the sentence
    text = "Be a voice, not an! echo. "
    sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

    # padding char
    text = "_Be a _voice, not an! echo_"
    sequence = phoneme_to_sequence(text, text_cleaner, lang)
    text_hat = sequence_to_phoneme(sequence)
+   sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+   text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
    gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
    print(text_hat)
    print(len(sequence))
-   assert text_hat == gt
+   assert text_hat == text_hat_with_params == gt

def test_text2phone():
    text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
    gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
    lang = "en-us"
    ph = text2phone(text, lang)
    assert gt == ph, f"\n{phonemes} \n vs \n{gt}"

View File

@@ -25,7 +25,7 @@ from TTS.utils.logger import Logger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
    get_speakers
from TTS.utils.synthesis import synthesis
-from TTS.utils.text.symbols import phonemes, symbols
+from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam
@@ -49,6 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
        c.text_cleaner,
        meta_data=meta_data_eval if is_val else meta_data_train,
        ap=ap,
+       tp=c.characters if 'characters' in c.keys() else None,
        batch_group_size=0 if is_val else c.batch_group_size *
        c.batch_size,
        min_seq_len=c.min_seq_len,
@@ -504,9 +505,12 @@ def evaluate(model, criterion, ap, global_step, epoch):
# FIXME: move args definition/parsing inside of main?
def main(args):  # pylint: disable=redefined-outer-name
-   global meta_data_train, meta_data_eval
+   # pylint: disable=global-variable-undefined
+   global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
+   if 'characters' in c.keys():
+       symbols, phonemes = make_symbols(**c.characters)

    # DISTRUBUTED
    if num_gpus > 1:

View File

@@ -427,6 +427,15 @@ def check_config(c):
    _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
    _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)

+   # vocabulary parameters
+   _check_argument('characters', c, restricted=False, val_type=dict)
+   _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
+   _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
+   _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
+   _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
+   _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
+   _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)

    # normalization parameters
    _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
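
The vocabulary checks added above make the `characters` block optional at the top level, while `restricted='characters' in c.keys()` turns each sub-key into a required field as soon as the block exists. A hedged sketch of that conditional-requirement idea, written without the project's `_check_argument` helper:

```python
def check_characters_block(config):
    """Mimics the intent of the checks above: optional block, mandatory sub-keys."""
    chars = config.get('characters')
    if chars is None:
        return  # no custom vocabulary; the defaults from symbols.py apply
    for key in ('pad', 'eos', 'bos', 'characters', 'punctuations', 'phonemes'):
        assert key in chars, f"characters.{key} is missing"
        assert isinstance(chars[key], str), f"characters.{key} must be a string"

check_characters_block({})  # ok: block absent
check_characters_block({'characters': {
    'pad': '_', 'eos': '~', 'bos': '^',
    'characters': 'abc', 'punctuations': "!'? ", 'phonemes': 'iyu'}})  # ok: complete block
```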

View File

@@ -9,10 +9,11 @@ def text_to_seqvec(text, CONFIG, use_cuda):
    if CONFIG.use_phonemes:
        seq = np.asarray(
            phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
-                               CONFIG.enable_eos_bos_chars),
+                               CONFIG.enable_eos_bos_chars,
+                               tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
            dtype=np.int32)
    else:
-       seq = np.asarray(text_to_sequence(text, text_cleaner), dtype=np.int32)
+       seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
    # torch tensor
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:

View File

@@ -5,15 +5,15 @@ from packaging import version
import phonemizer
from phonemizer.phonemize import phonemize
from TTS.utils.text import cleaners
-from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \
+from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
    _eos

# Mappings from symbol to numeric ID and vice versa:
-_SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)}
-_ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)}
-_PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)}
-_ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)}
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+_phonemes_to_id = {s: i for i, s in enumerate(phonemes)}
+_id_to_phonemes = {i: s for i, s in enumerate(phonemes)}

# Regular expression matching text enclosed in curly braces:
_CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)')
@@ -57,11 +57,25 @@ def text2phone(text, language):
    return ph

-def pad_with_eos_bos(phoneme_sequence):
-   return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]]
+def pad_with_eos_bos(phoneme_sequence, tp=None):
+   # pylint: disable=global-statement
+   global _phonemes_to_id, _bos, _eos
+   if tp:
+       _bos = tp['bos']
+       _eos = tp['eos']
+       _, _phonemes = make_symbols(**tp)
+       _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
+   return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]

-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
+   # pylint: disable=global-statement
+   global _phonemes_to_id
+   if tp:
+       _, _phonemes = make_symbols(**tp)
+       _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
    sequence = []
    text = text.replace(":", "")
    clean_text = _clean_text(text, cleaner_names)
@@ -73,21 +87,27 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
        sequence += _phoneme_to_sequence(phoneme)
    # Append EOS char
    if enable_eos_bos:
-       sequence = pad_with_eos_bos(sequence)
+       sequence = pad_with_eos_bos(sequence, tp=tp)
    return sequence

-def sequence_to_phoneme(sequence):
+def sequence_to_phoneme(sequence, tp=None):
+   # pylint: disable=global-statement
    '''Converts a sequence of IDs back to a string'''
+   global _id_to_phonemes
    result = ''
+   if tp:
+       _, _phonemes = make_symbols(**tp)
+       _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}
    for symbol_id in sequence:
-       if symbol_id in _ID_TO_PHONEMES:
-           s = _ID_TO_PHONEMES[symbol_id]
+       if symbol_id in _id_to_phonemes:
+           s = _id_to_phonemes[symbol_id]
            result += s
    return result.replace('}{', ' ')

-def text_to_sequence(text, cleaner_names):
+def text_to_sequence(text, cleaner_names, tp=None):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
@@ -100,6 +120,12 @@ def text_to_sequence(text, cleaner_names):
    Returns:
        List of integers corresponding to the symbols in the text
    '''
+   # pylint: disable=global-statement
+   global _symbol_to_id
+   if tp:
+       _symbols, _ = make_symbols(**tp)
+       _symbol_to_id = {s: i for i, s in enumerate(_symbols)}
    sequence = []
    # Check for curly braces and treat their contents as ARPAbet:
    while text:
@@ -114,12 +140,18 @@ def text_to_sequence(text, cleaner_names):
    return sequence

-def sequence_to_text(sequence):
+def sequence_to_text(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
+   # pylint: disable=global-statement
+   global _id_to_symbol
+   if tp:
+       _symbols, _ = make_symbols(**tp)
+       _id_to_symbol = {i: s for i, s in enumerate(_symbols)}
    result = ''
    for symbol_id in sequence:
-       if symbol_id in _ID_TO_SYMBOL:
-           s = _ID_TO_SYMBOL[symbol_id]
+       if symbol_id in _id_to_symbol:
+           s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
@@ -137,11 +169,11 @@ def _clean_text(text, cleaner_names):

def _symbols_to_sequence(syms):
-   return [_SYMBOL_TO_ID[s] for s in syms if _should_keep_symbol(s)]
+   return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)]

def _phoneme_to_sequence(phons):
-   return [_PHONEMES_TO_ID[s] for s in list(phons) if _should_keep_phoneme(s)]
+   return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)]

def _arpabet_to_sequence(text):
@@ -149,8 +181,8 @@ def _arpabet_to_sequence(text):

def _should_keep_symbol(s):
-   return s in _SYMBOL_TO_ID and s not in ['~', '^', '_']
+   return s in _symbol_to_id and s not in ['~', '^', '_']

def _should_keep_phoneme(p):
-   return p in _PHONEMES_TO_ID and p not in ['~', '^', '_']
+   return p in _phonemes_to_id and p not in ['~', '^', '_']
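
With these changes every public text-to-ID helper accepts an optional `tp` dict (the config's `characters` block) and rebuilds its module-level lookup tables from it before encoding; note the rebinding is global, so later calls without `tp` keep using the last custom table. A hedged usage sketch, assuming the TTS package is importable:

```python
from TTS.utils.text import text_to_sequence, sequence_to_text

custom = {
    "pad": "_", "eos": "~", "bos": "^",
    "characters": "abcdefghijklmnopqrstuvwxyz ",
    "punctuations": "!'(),-.:;? ",
    "phonemes": "iyu",  # deliberately tiny set, purely for illustration
}

seq = text_to_sequence("hello world", ["basic_cleaners"], tp=custom)
print(seq)                               # IDs drawn from the custom symbol table
print(sequence_to_text(seq, tp=custom))  # round-trips back to "hello world"
```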

View File

@@ -5,6 +5,18 @@ Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''

+def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name
+   ''' Function to create symbols and phonemes '''
+   _phonemes_sorted = sorted(list(phonemes))
+   # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
+   _arpabet = ['@' + s for s in _phonemes_sorted]
+   # Export all symbols:
+   _symbols = [pad, eos, bos] + list(characters) + _arpabet
+   _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
+   return _symbols, _phonemes

_pad = '_'
_eos = '~'
@@ -20,14 +32,9 @@ _pulmonic_consonants = 'pbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðsz
_suprasegmentals = 'ˈˌːˑ'
_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ'
_diacrilics = 'ɚ˞ɫ'
-_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
-# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in _phonemes]
-# Export all symbols:
-symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)

# Generate ALIEN language
# from random import shuffle
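
`make_symbols` is now the single place where both tables are assembled; the module-level `symbols`/`phonemes` above are just the result of calling it with the defaults. A small sketch of calling it directly, assuming the TTS package is importable:

```python
from TTS.utils.text.symbols import make_symbols

syms, phons = make_symbols(characters="abcde", phonemes="iyu",
                           punctuations="!? ", pad="_", eos="~", bos="^")

print(syms)   # ['_', '~', '^', 'a', 'b', 'c', 'd', 'e', '@i', '@u', '@y']
print(phons)  # ['_', '~', '^', 'i', 'u', 'y', '!', '?', ' ']
```

The '@'-prefixed entries keep ARPAbet symbols distinct from plain characters, which is why that prefixing moved into the function unchanged.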

View File

@@ -54,9 +54,10 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
-       seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars)
-       text = sequence_to_phoneme(seq)
+       seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
+       text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
        print(text)

    plt.yticks(range(len(text)), list(text))
    plt.colorbar()