diff --git a/server/synthesizer.py b/server/synthesizer.py
index 347bef21..afc083aa 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -94,8 +94,8 @@ class Synthesizer(object):
             sample_rate=self.ap.sample_rate,
         ).cuda()
 
-        check = torch.load(model_file)
-        self.wavernn.load_state_dict(check['model'], map_location="cpu")
+        check = torch.load(model_file, map_location="cpu")
+        self.wavernn.load_state_dict(check['model'])
         if use_cuda:
             self.wavernn.cuda()
         self.wavernn.eval()
diff --git a/utils/synthesis.py b/utils/synthesis.py
index 79a17c78..b4512dc6 100644
--- a/utils/synthesis.py
+++ b/utils/synthesis.py
@@ -69,6 +69,24 @@ def id_to_torch(speaker_id):
     return speaker_id
 
 
+# TODO: perform GL with pytorch for batching
+def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
+    '''Apply Griffin-Lim to each sample, iterating through the first dimension.
+    Args:
+        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
+        input_lens (Tensor or np.Array): 1D array of sample lengths.
+        CONFIG (Dict): TTS config.
+        ap (AudioProcessor): TTS audio processor.
+    '''
+    wavs = []
+    for idx, spec in enumerate(inputs):
+        wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length  # inverse librosa padding
+        wav = inv_spectrogram(spec, ap, CONFIG)
+        # assert len(wav) == wav_len, f" [!] wav length: {len(wav)} vs expected: {wav_len}"
+        wavs.append(wav[:wav_len])
+    return wavs
+
+
 def synthesis(model,
               text,
               CONFIG,
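
For reference, a minimal usage sketch of the new apply_griffin_lim helper. This is not part of the patch: the batch shapes, the CONFIG and ap objects, and the save_wav call are assumptions about how the surrounding synthesis code would supply and consume these values.

    # Hypothetical usage sketch (assumes `CONFIG` and `ap` are the TTS config and
    # AudioProcessor already in scope, as elsewhere in utils/synthesis.py; the
    # shapes below are illustrative only).
    import numpy as np
    from utils.synthesis import apply_griffin_lim

    # Padded spectrogram batch [batch, T_max, num_freq] and the number of valid
    # frames per sample; in practice these would come from the model's postnet
    # outputs and the predicted stop tokens.
    specs = np.random.rand(2, 200, 1025).astype(np.float32)
    spec_lens = np.array([180, 200])

    # Run Griffin-Lim per sample and trim each waveform to its true length.
    wavs = apply_griffin_lim(specs, spec_lens, CONFIG, ap)
    for idx, wav in enumerate(wavs):
        ap.save_wav(wav, f"gl_sample_{idx}.wav")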