From bc6764a5c7c58463d2a879f54ca0cc390ca69070 Mon Sep 17 00:00:00 2001
From: erogol
Date: Fri, 21 Feb 2020 14:57:10 +0100
Subject: [PATCH] bug fix at server

---
 server/synthesizer.py |  4 ++--
 utils/synthesis.py    | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/server/synthesizer.py b/server/synthesizer.py
index 347bef21..afc083aa 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -94,8 +94,8 @@ class Synthesizer(object):
             sample_rate=self.ap.sample_rate,
         ).cuda()
 
-        check = torch.load(model_file)
-        self.wavernn.load_state_dict(check['model'], map_location="cpu")
+        check = torch.load(model_file, map_location="cpu")
+        self.wavernn.load_state_dict(check['model'])
         if use_cuda:
             self.wavernn.cuda()
         self.wavernn.eval()
diff --git a/utils/synthesis.py b/utils/synthesis.py
index 79a17c78..b4512dc6 100644
--- a/utils/synthesis.py
+++ b/utils/synthesis.py
@@ -69,6 +69,24 @@ def id_to_torch(speaker_id):
     return speaker_id
 
 
+# TODO: perform GL with pytorch for batching
+def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
+    '''Apply Griffin-Lim to each sample, iterating through the first dimension.
+    Args:
+        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
+        input_lens (Tensor or np.Array): 1D array of sample lengths.
+        CONFIG (Dict): TTS config.
+        ap (AudioProcessor): TTS audio processor.
+    '''
+    wavs = []
+    for idx, spec in enumerate(inputs):
+        wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length  # inverse librosa padding
+        wav = inv_spectrogram(spec, ap, CONFIG)
+        # assert len(wav) == wav_len, f" [!] wav length: {len(wav)} vs expected: {wav_len}"
+        wavs.append(wav[:wav_len])
+    return wavs
+
+
 def synthesis(model, text, CONFIG,
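
Reviewer note on the server/synthesizer.py hunk: `map_location` is a parameter of `torch.load`, not of `nn.Module.load_state_dict`, so the old code raised a TypeError whenever it reached the CPU path. A minimal sketch of the corrected loading pattern, assuming an already-instantiated WaveRNN-style `model`; the checkpoint path and `use_cuda` flag below are placeholders, not values from this patch:

    import torch

    # map_location remaps GPU-saved storages to CPU at deserialization
    # time; load_state_dict only consumes the resulting state dict.
    check = torch.load("checkpoint.pth.tar", map_location="cpu")  # placeholder path
    model.load_state_dict(check["model"])
    if use_cuda:  # placeholder flag
        model.cuda()
    model.eval()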
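
Reviewer note on the utils/synthesis.py hunk: a hedged usage sketch of the new `apply_griffin_lim` helper, assuming the repo's top-level `utils` package (`load_config`, `AudioProcessor`, `ap.save_wav`); the config path, batch shapes, and dummy inputs are illustrative assumptions, not code from this diff:

    import numpy as np
    from utils.generic_utils import load_config
    from utils.audio import AudioProcessor
    from utils.synthesis import apply_griffin_lim

    CONFIG = load_config("config.json")  # placeholder path
    ap = AudioProcessor(**CONFIG.audio)

    # Stand-in for batched model outputs: (batch, T, feature_dim)
    # spectrograms plus per-sample frame lengths. feature_dim must match
    # what inv_spectrogram expects for CONFIG.model (linear or mel).
    batch = np.random.rand(2, 120, 80).astype(np.float32)
    lens = np.array([120, 90])

    wavs = apply_griffin_lim(batch, lens, CONFIG, ap)
    for i, wav in enumerate(wavs):
        ap.save_wav(wav, f"gl_sample_{i}.wav")

Note that the helper trims one hop_length of librosa padding from each sample, so each returned waveform is slightly shorter than input_lens[i] * hop_length.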