mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'master' of github.com:mozilla/TTS
This commit is contained in:
commit
e546efbed7
|
@ -140,7 +140,7 @@ Please feel free to offer new changes and pull things off. We are happy to discu
|
||||||
- [x] Generate human-like speech on LJSpeech dataset.
|
- [x] Generate human-like speech on LJSpeech dataset.
|
||||||
- [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
|
- [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
|
||||||
- [x] Train TTS with r=1 successfully.
|
- [x] Train TTS with r=1 successfully.
|
||||||
- [ ] Enable process based distributed training. Similar [to](https://github.com/fastai/imagenet-fast/).
|
- [x] Enable process based distributed training. Similar [to](https://github.com/fastai/imagenet-fast/).
|
||||||
- [ ] Adapting Neural Vocoder. The most active work is [here](https://github.com/erogol/WaveRNN)
|
- [ ] Adapting Neural Vocoder. The most active work is [here](https://github.com/erogol/WaveRNN)
|
||||||
- [ ] Multi-speaker embedding.
|
- [ ] Multi-speaker embedding.
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,13 @@
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import librosa
|
|
||||||
import torch
|
|
||||||
import scipy
|
|
||||||
import numpy as np
|
|
||||||
import soundfile as sf
|
|
||||||
from utils.text import text_to_sequence
|
|
||||||
from utils.generic_utils import load_config
|
|
||||||
from utils.audio import AudioProcessor
|
|
||||||
from models.tacotron import Tacotron
|
|
||||||
from matplotlib import pylab as plt
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from models.tacotron import Tacotron
|
||||||
|
from utils.audio import AudioProcessor
|
||||||
|
from utils.generic_utils import load_config
|
||||||
|
from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence
|
||||||
|
|
||||||
class Synthesizer(object):
|
class Synthesizer(object):
|
||||||
def load_model(self, model_path, model_name, model_config, use_cuda):
|
def load_model(self, model_path, model_name, model_config, use_cuda):
|
||||||
|
@ -22,14 +19,22 @@ class Synthesizer(object):
|
||||||
config = load_config(model_config)
|
config = load_config(model_config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.use_cuda = use_cuda
|
self.use_cuda = use_cuda
|
||||||
|
self.use_phonemes = config.use_phonemes
|
||||||
self.ap = AudioProcessor(**config.audio)
|
self.ap = AudioProcessor(**config.audio)
|
||||||
self.model = Tacotron(config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
|
|
||||||
|
if self.use_phonemes:
|
||||||
|
self.input_size = len(phonemes)
|
||||||
|
self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
|
||||||
|
else:
|
||||||
|
self.input_size = len(symbols)
|
||||||
|
self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
|
||||||
|
|
||||||
|
self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
|
||||||
# load model state
|
# load model state
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
cp = torch.load(self.model_file)
|
cp = torch.load(self.model_file)
|
||||||
else:
|
else:
|
||||||
cp = torch.load(
|
cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
|
||||||
self.model_file, map_location=lambda storage, loc: storage)
|
|
||||||
# load the model
|
# load the model
|
||||||
self.model.load_state_dict(cp['model'])
|
self.model.load_state_dict(cp['model'])
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
|
@ -42,7 +47,6 @@ class Synthesizer(object):
|
||||||
self.ap.save_wav(wav, path)
|
self.ap.save_wav(wav, path)
|
||||||
|
|
||||||
def tts(self, text):
|
def tts(self, text):
|
||||||
text_cleaner = [self.config.text_cleaner]
|
|
||||||
wavs = []
|
wavs = []
|
||||||
for sen in text.split('.'):
|
for sen in text.split('.'):
|
||||||
if len(sen) < 3:
|
if len(sen) < 3:
|
||||||
|
@ -51,7 +55,9 @@ class Synthesizer(object):
|
||||||
sen += '.'
|
sen += '.'
|
||||||
print(sen)
|
print(sen)
|
||||||
sen = sen.strip()
|
sen = sen.strip()
|
||||||
seq = np.array(text_to_sequence(sen, text_cleaner))
|
|
||||||
|
seq = np.array(self.input_adapter(sen))
|
||||||
|
|
||||||
chars_var = torch.from_numpy(seq).unsqueeze(0).long()
|
chars_var = torch.from_numpy(seq).unsqueeze(0).long()
|
||||||
if self.use_cuda:
|
if self.use_cuda:
|
||||||
chars_var = chars_var.cuda()
|
chars_var = chars_var.cuda()
|
||||||
|
@ -59,8 +65,9 @@ class Synthesizer(object):
|
||||||
chars_var)
|
chars_var)
|
||||||
linear_out = linear_out[0].data.cpu().numpy()
|
linear_out = linear_out[0].data.cpu().numpy()
|
||||||
wav = self.ap.inv_spectrogram(linear_out.T)
|
wav = self.ap.inv_spectrogram(linear_out.T)
|
||||||
out = io.BytesIO()
|
|
||||||
wavs += list(wav)
|
wavs += list(wav)
|
||||||
wavs += [0] * 10000
|
wavs += [0] * 10000
|
||||||
|
|
||||||
|
out = io.BytesIO()
|
||||||
self.save_wav(wavs, out)
|
self.save_wav(wavs, out)
|
||||||
return out
|
return out
|
|
@ -0,0 +1,7 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from utils.text import phonemes
|
||||||
|
|
||||||
|
class SymbolsTest(unittest.TestCase):
|
||||||
|
def test_uniqueness(self):
|
||||||
|
assert sorted(phonemes) == sorted(list(set(phonemes)))
|
|
@ -5,7 +5,6 @@ Defines the set of symbols used in text input to the model.
|
||||||
The default is a set of ASCII characters that works well for English or text that has been run
|
The default is a set of ASCII characters that works well for English or text that has been run
|
||||||
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
||||||
'''
|
'''
|
||||||
from utils.text import cmudict
|
|
||||||
|
|
||||||
_pad = '_'
|
_pad = '_'
|
||||||
_eos = '~'
|
_eos = '~'
|
||||||
|
@ -13,22 +12,24 @@ _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
|
||||||
_punctuations = '!\'(),-.:;? '
|
_punctuations = '!\'(),-.:;? '
|
||||||
_phoneme_punctuations = '.!;:,?'
|
_phoneme_punctuations = '.!;:,?'
|
||||||
|
|
||||||
# TODO: include more phoneme characters for other languages.
|
# Phonemes definition
|
||||||
_phonemes = ['l','ɹ','ɜ','ɚ','k','u','ʔ','ð','ɐ','ɾ','ɑ','ɔ','b','ɛ','t','v','n','m','ʊ','ŋ','s',
|
_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
|
||||||
'ʌ','o','ʃ','i','p','æ','e','a','ʒ',' ','h','ɪ','ɡ','f','r','w','ɫ','ɬ','d','x','ː',
|
_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
|
||||||
'ᵻ','ə','j','θ','z','ɒ']
|
_pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
|
||||||
|
_suprasegmentals = 'ˈˌːˑ'
|
||||||
_phonemes = sorted(list(set(_phonemes)))
|
_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ '
|
||||||
|
_diacrilics = 'ɚ˞ɫ'
|
||||||
|
_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
|
||||||
|
|
||||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||||
_arpabet = ['@' + s for s in _phonemes]
|
_arpabet = ['@' + s for s in _phonemes]
|
||||||
|
|
||||||
# Export all symbols:
|
# Export all symbols:
|
||||||
symbols = [_pad, _eos] + list(_characters) + _arpabet
|
symbols = [_pad, _eos] + list(_characters) + _arpabet
|
||||||
phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
|
phonemes = [_pad, _eos] + _phonemes + list(_punctuations)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(" > TTS symbols ")
|
print(" > TTS symbols {}".format(len(symbols)))
|
||||||
print(symbols)
|
print(symbols)
|
||||||
print(" > TTS phonemes ")
|
print(" > TTS phonemes {}".format(len(phonemes)))
|
||||||
print(phonemes)
|
print(phonemes)
|
||||||
|
|
Loading…
Reference in New Issue