Merge branch 'master' of github.com:mozilla/TTS

2019-03-12 10:17:09 +01:00 · 2019-03-12 10:17:09 +01:00 · e546efbed7
parent cc34fe4c7c 5acc9db4ac
commit e546efbed7
4 changed files with 43 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -140,7 +140,7 @@ Please feel free to offer new changes and pull things off. We are happy to discu
 - [x] Generate human-like speech on LJSpeech dataset.
 - [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
 - [x] Train TTS with r=1 successfully.
- [ ] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/).
+- [x] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/).
 - [ ] Adapting Neural Vocoder. The most active work is [here] (https://github.com/erogol/WaveRNN)
 - [ ] Multi-speaker embedding.
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@ -1,16 +1,13 @@
 import io
 import os
 import librosa
 import torch
 import scipy
 import numpy as np
 import soundfile as sf
 from utils.text import text_to_sequence
 from utils.generic_utils import load_config
 from utils.audio import AudioProcessor
 from models.tacotron import Tacotron
 from matplotlib import pylab as plt
 import numpy as np
 import torch
 from models.tacotron import Tacotron
 from utils.audio import AudioProcessor
 from utils.generic_utils import load_config
 from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence
 class Synthesizer(object):
    def load_model(self, model_path, model_name, model_config, use_cuda):
@ -22,14 +19,22 @@ class Synthesizer(object):
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)
-        self.model = Tacotron(config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
+
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
        self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
-            cp = torch.load(
+            cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
                self.model_file, map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
@ -42,7 +47,6 @@ class Synthesizer(object):
        self.ap.save_wav(wav, path)
    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
@ -51,7 +55,9 @@ class Synthesizer(object):
            sen += '.'
            print(sen)
            sen = sen.strip()
-            seq = np.array(text_to_sequence(sen, text_cleaner))
+
            seq = np.array(self.input_adapter(sen))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
@ -59,8 +65,9 @@ class Synthesizer(object):
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            out = io.BytesIO()
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
--- a/tests/symbols_tests.py
+++ b/tests/symbols_tests.py
@ -0,0 +1,7 @@
 import unittest
 from utils.text import phonemes
 class SymbolsTest(unittest.TestCase):
    def test_uniqueness(self):
        assert sorted(phonemes) == sorted(list(set(phonemes)))
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@ -5,7 +5,6 @@ Defines the set of symbols used in text input to the model.
 The default is a set of ASCII characters that works well for English or text that has been run
 through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
 '''
 from utils.text import cmudict
 _pad = '_'
 _eos = '~'
@ -13,22 +12,24 @@ _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
 _phoneme_punctuations = '.!;:,?'
-# TODO: include more phoneme characters for other languages.
+# Phonemes definition
-_phonemes = ['l','ɹ','ɜ','ɚ','k','u','ʔ','ð','ɐ','ɾ','ɑ','ɔ','b','ɛ','t','v','n','m','ʊ','ŋ','s',
+_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
-             'ʌ','o','ʃ','i','p','æ','e','a','ʒ',' ','h','ɪ','ɡ','f','r','w','ɫ','ɬ','d','x','ː',
+_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
-             'ᵻ','ə','j','θ','z','ɒ']
+_pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
-
+_suprasegmentals = 'ˈˌːˑ'
-_phonemes = sorted(list(set(_phonemes)))
+_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ '
 _diacrilics = 'ɚ˞ɫ'
 _phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
 _arpabet = ['@' + s for s in _phonemes]
 # Export all symbols:
 symbols = [_pad, _eos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
+phonemes = [_pad, _eos] + _phonemes + list(_punctuations)
 if __name__ == '__main__':
-    print(" > TTS symbols ")
+    print(" > TTS symbols {}".format(len(symbols)))
    print(symbols)
-    print(" > TTS phonemes ")
+    print(" > TTS phonemes {}".format(len(phonemes)))
    print(phonemes)