Merge branch 'master' of github.com:mozilla/TTS

This commit is contained in:
Eren Golge 2019-03-12 10:17:09 +01:00
commit e546efbed7
4 changed files with 43 additions and 28 deletions

View File

@ -140,7 +140,7 @@ Please feel free to offer new changes and pull things off. We are happy to discu
- [x] Generate human-like speech on LJSpeech dataset. - [x] Generate human-like speech on LJSpeech dataset.
- [x] Generate human-like speech on a different dataset (Nancy) (TWEB). - [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
- [x] Train TTS with r=1 successfully. - [x] Train TTS with r=1 successfully.
- [ ] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/). - [x] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/).
- [ ] Adapting Neural Vocoder. The most active work is [here] (https://github.com/erogol/WaveRNN) - [ ] Adapting Neural Vocoder. The most active work is [here] (https://github.com/erogol/WaveRNN)
- [ ] Multi-speaker embedding. - [ ] Multi-speaker embedding.

View File

@ -1,16 +1,13 @@
import io import io
import os import os
import librosa
import torch
import scipy
import numpy as np
import soundfile as sf
from utils.text import text_to_sequence
from utils.generic_utils import load_config
from utils.audio import AudioProcessor
from models.tacotron import Tacotron
from matplotlib import pylab as plt
import numpy as np
import torch
from models.tacotron import Tacotron
from utils.audio import AudioProcessor
from utils.generic_utils import load_config
from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence
class Synthesizer(object): class Synthesizer(object):
def load_model(self, model_path, model_name, model_config, use_cuda): def load_model(self, model_path, model_name, model_config, use_cuda):
@ -22,14 +19,22 @@ class Synthesizer(object):
config = load_config(model_config) config = load_config(model_config)
self.config = config self.config = config
self.use_cuda = use_cuda self.use_cuda = use_cuda
self.use_phonemes = config.use_phonemes
self.ap = AudioProcessor(**config.audio) self.ap = AudioProcessor(**config.audio)
self.model = Tacotron(config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
if self.use_phonemes:
self.input_size = len(phonemes)
self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
else:
self.input_size = len(symbols)
self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
# load model state # load model state
if use_cuda: if use_cuda:
cp = torch.load(self.model_file) cp = torch.load(self.model_file)
else: else:
cp = torch.load( cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
self.model_file, map_location=lambda storage, loc: storage)
# load the model # load the model
self.model.load_state_dict(cp['model']) self.model.load_state_dict(cp['model'])
if use_cuda: if use_cuda:
@ -42,7 +47,6 @@ class Synthesizer(object):
self.ap.save_wav(wav, path) self.ap.save_wav(wav, path)
def tts(self, text): def tts(self, text):
text_cleaner = [self.config.text_cleaner]
wavs = [] wavs = []
for sen in text.split('.'): for sen in text.split('.'):
if len(sen) < 3: if len(sen) < 3:
@ -51,7 +55,9 @@ class Synthesizer(object):
sen += '.' sen += '.'
print(sen) print(sen)
sen = sen.strip() sen = sen.strip()
seq = np.array(text_to_sequence(sen, text_cleaner))
seq = np.array(self.input_adapter(sen))
chars_var = torch.from_numpy(seq).unsqueeze(0).long() chars_var = torch.from_numpy(seq).unsqueeze(0).long()
if self.use_cuda: if self.use_cuda:
chars_var = chars_var.cuda() chars_var = chars_var.cuda()
@ -59,8 +65,9 @@ class Synthesizer(object):
chars_var) chars_var)
linear_out = linear_out[0].data.cpu().numpy() linear_out = linear_out[0].data.cpu().numpy()
wav = self.ap.inv_spectrogram(linear_out.T) wav = self.ap.inv_spectrogram(linear_out.T)
out = io.BytesIO()
wavs += list(wav) wavs += list(wav)
wavs += [0] * 10000 wavs += [0] * 10000
out = io.BytesIO()
self.save_wav(wavs, out) self.save_wav(wavs, out)
return out return out

7
tests/symbols_tests.py Normal file
View File

@ -0,0 +1,7 @@
import unittest
from utils.text import phonemes
class SymbolsTest(unittest.TestCase):
    """Sanity checks for the symbol tables exported by ``utils.text``."""

    def test_uniqueness(self):
        """Fail if any entry occurs more than once in ``phonemes``.

        Uses ``self.assertEqual`` instead of a bare ``assert`` so the check
        still runs under ``python -O`` and so a failure reports exactly
        which symbols are duplicated.
        """
        # The table is small, so the quadratic ``count`` scan is acceptable
        # and avoids pulling in any extra imports.
        duplicates = sorted({p for p in phonemes if phonemes.count(p) > 1})
        self.assertEqual(
            duplicates, [],
            "duplicated phoneme symbols: {}".format(duplicates))

View File

@ -5,7 +5,6 @@ Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
''' '''
from utils.text import cmudict
_pad = '_' _pad = '_'
_eos = '~' _eos = '~'
@ -13,22 +12,24 @@ _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
_punctuations = '!\'(),-.:;? ' _punctuations = '!\'(),-.:;? '
_phoneme_punctuations = '.!;:,?' _phoneme_punctuations = '.!;:,?'
# TODO: include more phoneme characters for other languages. # Phonemes definition
_phonemes = ['l','ɹ','ɜ','ɚ','k','u','ʔ','ð','ɐ','ɾ','ɑ','ɔ','b','ɛ','t','v','n','m','ʊ','ŋ','s', _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
'ʌ','o','ʃ','i','p','æ','e','a','ʒ',' ','h','ɪ','ɡ','f','r','w','ɫ','ɬ','d','x','ː', _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
'','ə','j','θ','z','ɒ'] _pulmonic_consonants = 'pbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
_suprasegmentals = 'ˈˌːˑ'
_phonemes = sorted(list(set(_phonemes))) _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ '
_diacrilics = 'ɚ˞ɫ'
_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in _phonemes] _arpabet = ['@' + s for s in _phonemes]
# Export all symbols: # Export all symbols:
symbols = [_pad, _eos] + list(_characters) + _arpabet symbols = [_pad, _eos] + list(_characters) + _arpabet
phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations) phonemes = [_pad, _eos] + _phonemes + list(_punctuations)
if __name__ == '__main__': if __name__ == '__main__':
print(" > TTS symbols ") print(" > TTS symbols {}".format(len(symbols)))
print(symbols) print(symbols)
print(" > TTS phonemes ") print(" > TTS phonemes {}".format(len(phonemes)))
print(phonemes) print(phonemes)