From 54fb236c86d78a44094e61fe3c660307a26d8d0d Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 22 Jul 2019 15:10:06 +0200
Subject: [PATCH] demo server update

---
 server/conf.json      | 13 +++++++------
 server/synthesizer.py | 17 ++++++++++++++---
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/server/conf.json b/server/conf.json
index ba8d5016..6341596d 100644
--- a/server/conf.json
+++ b/server/conf.json
@@ -1,11 +1,12 @@
 {
-    "tts_path":"/media/erogol/data_ssd/Data/models/ljspeech_models/ljspeech-April-08-2019_07+32PM-8a47b46/", // tts model root folder
-    "tts_file":"checkpoint_261000.pth.tar", // tts checkpoint file
+    "tts_path":"/media/erogol/data_ssd/Models/libri_tts/ljspeech-July-22-2019_10+45AM-ee706b5/", // tts model root folder
+    "tts_file":"best_model.pth.tar", // tts checkpoint file
     "tts_config":"config.json", // tts config.json file
-    "wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be important. If this is none, model uses GL for speech synthesis.
-    "wavernn_path":"/media/erogol/data_ssd/Data/models/wavernn/ljspeech/mold_ljspeech_best_model/", // wavernn model root path
-    "wavernn_file":"checkpoint_433000.pth.tar", // wavernn checkpoint file name
-    "wavernn_config":"config.json", // wavernn config file
+    "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
+    "wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
+    "wavernn_path":"/media/erogol/data_ssd/Models/wavernn/universal/4910/", // wavernn model root path
+    "wavernn_file":"best_model_16K.pth.tar", // wavernn checkpoint file name
+    "wavernn_config":"config_16K.json", // wavernn config file
     "is_wavernn_batched":true,
     "port": 5002,
     "use_cuda": true,
diff --git a/server/synthesizer.py b/server/synthesizer.py
index 29895b73..bdfd8c6c 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -8,6 +8,7 @@ import sys
 from utils.audio import AudioProcessor
 from utils.generic_utils import load_config, setup_model
 from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme
+from utils.speakers import load_speaker_mapping

 import re
 alphabets = r"([A-Za-z])"
@@ -44,7 +45,13 @@ class Synthesizer(object):
         else:
             self.input_size = len(symbols)
             self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
-        self.tts_model = setup_model(self.input_size, c=self.tts_config) #FIXME: missing num_speakers argument to setup_model
+        # load speakers
+        if self.config.tts_speakers is not None:
+            self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers))
+            num_speakers = len(self.tts_speakers)
+        else:
+            num_speakers = 0
+        self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
         # load model state
         if use_cuda:
             cp = torch.load(self.model_file)
@@ -58,6 +65,7 @@ class Synthesizer(object):
         self.tts_model.decoder.max_decoder_steps = 3000

     def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
+        # TODO: set a function in wavernn code base for model setup and call it here.
         sys.path.append(lib_path) # set this if TTS is not installed globally
         from WaveRNN.models.wavernn import Model
         wavernn_config = os.path.join(model_path, model_config)
@@ -70,8 +78,11 @@ class Synthesizer(object):
             rnn_dims=512,
             fc_dims=512,
             mode=self.wavernn_config.mode,
-            pad=2,
-            upsample_factors=self.wavernn_config.upsample_factors, # set this depending on dataset
+            mulaw=self.wavernn_config.mulaw,
+            pad=self.wavernn_config.pad,
+            use_aux_net=self.wavernn_config.use_aux_net,
+            use_upsample_net=self.wavernn_config.use_upsample_net,
+            upsample_factors=self.wavernn_config.upsample_factors,
             feat_dims=80,
             compute_dims=128,
             res_out_dims=128,
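
For reference, the sketch below (not part of the patch) mirrors how the new "tts_speakers" option is resolved into a speaker count in load_tts. The get_num_speakers helper and the "speakers.json" file name are illustrative assumptions only; in the patch itself the file is read via utils.speakers.load_speaker_mapping and the result is passed to setup_model as num_speakers.

    # Minimal sketch of the new speaker handling, assuming the speaker mapping
    # is a JSON file of the form {"speaker_name": speaker_id, ...}.
    import json
    import os

    def get_num_speakers(model_path, tts_speakers):
        # Mirrors the patched load_tts() behaviour: no mapping file configured
        # means a single-speaker model (num_speakers=0); otherwise count the
        # entries in the mapping and pass that count to setup_model().
        if tts_speakers is None:
            return 0
        with open(os.path.join(model_path, tts_speakers), "r") as f:
            return len(json.load(f))

    # e.g. get_num_speakers("/path/to/tts_model/", "speakers.json")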