From caae1af4f6020277d16bb9e6cbfab26782308f29 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Mon, 25 Feb 2019 17:20:05 +0100
Subject: [PATCH] visual updates for phonemes

---
 utils/generic_utils.py |  1 +
 utils/synthesis.py     |  4 +---
 utils/text/__init__.py |  2 +-
 utils/visual.py        | 23 ++++++++++++++---------
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 7d4c961f..1c178fd6 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -182,4 +182,5 @@ def sequence_mask(sequence_length, max_len=None):
         seq_range_expand = seq_range_expand.cuda()
     seq_length_expand = (sequence_length.unsqueeze(1)
                          .expand_as(seq_range_expand))
+    # B x T_max
     return seq_range_expand < seq_length_expand
diff --git a/utils/synthesis.py b/utils/synthesis.py
index df36438a..dda36333 100644
--- a/utils/synthesis.py
+++ b/utils/synthesis.py
@@ -3,7 +3,7 @@ import time
 import librosa
 import torch
 import numpy as np
-from .text import text_to_sequence, phoneme_to_sequence
+from .text import text_to_sequence, phoneme_to_sequence, sequence_to_phoneme
 from .visual import visualize
 from matplotlib import pylab as plt
 
@@ -11,8 +11,6 @@ from matplotlib import pylab as plt
 def synthesis(m, s, CONFIG, use_cuda, ap):
     """ Given the text, synthesising the audio """
     text_cleaner = [CONFIG.text_cleaner]
-    # print(phoneme_to_sequence(s, text_cleaner))s
-    # print(sequence_to_phoneme(phoneme_to_sequence(s, text_cleaner)))
     if CONFIG.use_phonemes:
         seq = np.asarray(
             phoneme_to_sequence(s, text_cleaner, CONFIG.phoneme_language),
diff --git a/utils/text/__init__.py b/utils/text/__init__.py
index aae20e6b..76993d50 100644
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -52,7 +52,7 @@ def phoneme_to_sequence(text, cleaner_names, language):
     for phoneme in phonemes.split('|'):
         # print(word, ' -- ', phonemes_text)
         sequence += _phoneme_to_sequence(phoneme)
-    # Aeepnd EOS char
+    # Append EOS char
     sequence.append(_phonemes_to_id['~'])
     return sequence
 
diff --git a/utils/visual.py b/utils/visual.py
index 9114da91..7efca05f 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -3,6 +3,7 @@ import librosa
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
+from utils.text import phoneme_to_sequence, sequence_to_phoneme
 
 
 def plot_alignment(alignment, info=None):
@@ -29,19 +30,22 @@ def plot_spectrogram(linear_output, audio):
     return fig
 
 
-def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spectrogram2=None):
-    if spectrogram2 is not None:
+def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None):
+    if spectrogram is not None:
         num_plot = 4
     else:
         num_plot = 3
 
     label_fontsize = 16
-    plt.figure(figsize=(16, 32))
+    plt.figure(figsize=(16, 48))
 
     plt.subplot(num_plot, 1, 1)
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    if CONFIG.use_phonemes:
+        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language)
+        text = sequence_to_phoneme(seq)
     plt.yticks(range(len(text)), list(text))
     plt.colorbar()
     
@@ -50,17 +54,18 @@ def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spe
     plt.plot(range(len(stop_tokens)), list(stop_tokens))
 
     plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
+    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
                              hop_length=hop_length, x_axis="time", y_axis="linear")
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
+    plt.tight_layout()
+    plt.colorbar()
 
-    if spectrogram2 is not None:
+    if spectrogram is not None:
         plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram2.T, sr=CONFIG.audio['sample_rate'],
+        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length, x_axis="time", y_axis="linear")
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
-
-    plt.tight_layout()
-    plt.colorbar()
+        plt.tight_layout()
+        plt.colorbar()