From 391dab45f024016dcb625ef8b8182f32c7a6aae1 Mon Sep 17 00:00:00 2001 From: erogol Date: Sun, 29 Mar 2020 23:07:12 +0200 Subject: [PATCH] update ExtractTTSSpecs notebook --- notebooks/ExtractTTSpectrogram.ipynb | 97 +++++++++++++++++----------- utils/audio.py | 5 +- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index b5a88611..c747c764 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -7,15 +7,6 @@ "This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TTS_PATH = \"/home/erogol/projects/\"" - ] - }, { "cell_type": "code", "execution_count": null, @@ -26,7 +17,6 @@ "%autoreload 2\n", "import os\n", "import sys\n", - "sys.path.append(TTS_PATH)\n", "import torch\n", "import importlib\n", "import numpy as np\n", @@ -42,7 +32,7 @@ "%matplotlib inline\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='2'" + "os.environ['CUDA_VISIBLE_DEVICES']='0'" ] }, { @@ -69,12 +59,12 @@ "metadata": {}, "outputs": [], "source": [ - "OUT_PATH = \"/data/rw/pit/data/turkish-vocoder/\"\n", - "DATA_PATH = \"/data/rw/home/Turkish\"\n", + "OUT_PATH = \"/home/erogol/Data/LJSpeech-1.1/ljspeech-March-17-2020_01+16AM-871588c/\"\n", + "DATA_PATH = \"/home/erogol/Data/LJSpeech-1.1/\"\n", "DATASET = \"ljspeech\"\n", - "METADATA_FILE = \"metadata.txt\"\n", - "CONFIG_PATH = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/config.json\"\n", - "MODEL_FILE = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/checkpoint_255000.pth.tar\"\n", + "METADATA_FILE = \"metadata.csv\"\n", + "CONFIG_PATH = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/config.json\"\n", + "MODEL_FILE = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/checkpoint_420000.pth.tar\"\n", "BATCH_SIZE = 32\n", "\n", "QUANTIZED_WAV = False\n", @@ -85,6 +75,7 @@ "print(\" > CUDA enabled: \", use_cuda)\n", "\n", "C = load_config(CONFIG_PATH)\n", + "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n", "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)" ] }, @@ -94,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "# if the vocabulary was passed, replace the default\n", + "# if the vocabulary was passed, replace the default\n", "if 'characters' in C.keys():\n", " symbols, phonemes = make_symbols(**C.characters)\n", "\n", @@ -120,7 +111,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, @@ -143,7 +134,7 @@ "metadata = []\n", "losses = []\n", "postnet_losses = []\n", - "criterion = L1LossMasked()\n", + "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n", "with torch.no_grad():\n", " for data in tqdm(loader):\n", " # setup input data\n", @@ -232,7 +223,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Check model performance" + "### Sanity Check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 1\n", + "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "wav, sr = sf.read(item_idx[idx])\n", + "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n", + "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n", + "mel_truth = ap.melspectrogram(wav)\n", + "print(mel_truth.shape)" ] }, { @@ -242,10 +257,8 @@ "outputs": [], "source": [ "# plot posnet output\n", - "idx = 1\n", - "mel_example = postnet_outputs[idx]\n", - "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n", - "print(mel_example[:mel_lengths[1], :].shape)" + "plot_spectrogram(mel_postnet, ap);\n", + "print(mel_postnet[:mel_lengths[idx], :].shape)" ] }, { @@ -255,9 +268,8 @@ "outputs": [], "source": [ "# plot decoder output\n", - "mel_example = mel_outputs[idx].data.cpu().numpy()\n", - "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n", - "print(mel_example[:mel_lengths[1], :].shape)" + "plot_spectrogram(mel_decoder, ap);\n", + "print(mel_decoder.shape)" ] }, { @@ -267,10 +279,8 @@ "outputs": [], "source": [ "# plot GT specgrogram\n", - "wav = ap.load_wav(item_idx[idx])\n", - "melt = ap.melspectrogram(wav)\n", - "print(melt.shape)\n", - "plot_spectrogram(melt.T, ap);" + "print(mel_truth.shape)\n", + "plot_spectrogram(mel_truth.T, ap);" ] }, { @@ -281,9 +291,9 @@ "source": [ "# postnet, decoder diff\n", "from matplotlib import pylab as plt\n", - "mel_diff = mel_outputs[idx] - postnet_outputs[idx]\n", + "mel_diff = mel_decoder - mel_postnet\n", "plt.figure(figsize=(16, 10))\n", - "plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", "plt.colorbar()\n", "plt.tight_layout()" ] @@ -294,10 +304,25 @@ "metadata": {}, "outputs": [], "source": [ + "# PLOT GT SPECTROGRAM diff\n", "from matplotlib import pylab as plt\n", - "# mel = mel_poutputs[idx].detach().cpu().numpy()\n", - "mel = postnet_outputs[idx].detach().cpu().numpy()\n", - "mel_diff2 = melt.T - mel[:melt.shape[1]]\n", + "mel_diff2 = mel_truth.T - mel_decoder\n", + "plt.figure(figsize=(16, 10))\n", + "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.colorbar()\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PLOT GT SPECTROGRAM diff\n", + "from matplotlib import pylab as plt\n", + "mel = postnet_outputs[idx]\n", + "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n", "plt.figure(figsize=(16, 10))\n", "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", "plt.colorbar()\n", diff --git a/utils/audio.py b/utils/audio.py index 67110134..be44cc42 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -269,12 +269,11 @@ class AudioProcessor(object): y = self._istft(S_complex * angles) return y - def compute_stft_paddings(x, fsize, fshift, pad_sides=1): + def compute_stft_paddings(x, pad_sides=1): '''compute right padding (final frame) or both sides padding (first and final frames) ''' assert pad_sides in (1, 2) - # return int(fsize // 2) - pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] + pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] if pad_sides == 1: return 0, pad else: