From caae1af4f6020277d16bb9e6cbfab26782308f29 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 25 Feb 2019 17:20:05 +0100 Subject: [PATCH] visual updates for phonemes --- utils/generic_utils.py | 1 + utils/synthesis.py | 4 +--- utils/text/__init__.py | 2 +- utils/visual.py | 23 ++++++++++++++--------- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7d4c961f..1c178fd6 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -182,4 +182,5 @@ def sequence_mask(sequence_length, max_len=None): seq_range_expand = seq_range_expand.cuda() seq_length_expand = (sequence_length.unsqueeze(1) .expand_as(seq_range_expand)) + # B x T_max return seq_range_expand < seq_length_expand diff --git a/utils/synthesis.py b/utils/synthesis.py index df36438a..dda36333 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -3,7 +3,7 @@ import time import librosa import torch import numpy as np -from .text import text_to_sequence, phoneme_to_sequence +from .text import text_to_sequence, phoneme_to_sequence, sequence_to_phoneme from .visual import visualize from matplotlib import pylab as plt @@ -11,8 +11,6 @@ from matplotlib import pylab as plt def synthesis(m, s, CONFIG, use_cuda, ap): """ Given the text, synthesising the audio """ text_cleaner = [CONFIG.text_cleaner] - # print(phoneme_to_sequence(s, text_cleaner))s - # print(sequence_to_phoneme(phoneme_to_sequence(s, text_cleaner))) if CONFIG.use_phonemes: seq = np.asarray( phoneme_to_sequence(s, text_cleaner, CONFIG.phoneme_language), diff --git a/utils/text/__init__.py b/utils/text/__init__.py index aae20e6b..76993d50 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -52,7 +52,7 @@ def phoneme_to_sequence(text, cleaner_names, language): for phoneme in phonemes.split('|'): # print(word, ' -- ', phonemes_text) sequence += _phoneme_to_sequence(phoneme) - # Aeepnd EOS char + # Append EOS char sequence.append(_phonemes_to_id['~']) 
return sequence diff --git a/utils/visual.py b/utils/visual.py index 9114da91..7efca05f 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -3,6 +3,7 @@ import librosa import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt +from utils.text import phoneme_to_sequence, sequence_to_phoneme def plot_alignment(alignment, info=None): @@ -29,19 +30,22 @@ def plot_spectrogram(linear_output, audio): return fig -def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spectrogram2=None): - if spectrogram2 is not None: +def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None): + if spectrogram is not None: num_plot = 4 else: num_plot = 3 label_fontsize = 16 - plt.figure(figsize=(16, 32)) + plt.figure(figsize=(16, 48)) plt.subplot(num_plot, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) + if CONFIG.use_phonemes: + seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language) + text = sequence_to_phoneme(seq) plt.yticks(range(len(text)), list(text)) plt.colorbar() @@ -50,17 +54,18 @@ def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spe plt.plot(range(len(stop_tokens)), list(stop_tokens)) plt.subplot(num_plot, 1, 3) - librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'], + librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, x_axis="time", y_axis="linear") plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) + plt.tight_layout() + plt.colorbar() - if spectrogram2 is not None: + if spectrogram is not None: plt.subplot(num_plot, 1, 4) - librosa.display.specshow(spectrogram2.T, sr=CONFIG.audio['sample_rate'], + librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, 
x_axis="time", y_axis="linear") plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) - - plt.tight_layout() - plt.colorbar() + plt.tight_layout() + plt.colorbar()