mirror of https://github.com/coqui-ai/TTS.git
update ExtractTTSSpecs notebook
This commit is contained in:
parent
a678d684a2
commit
391dab45f0
|
@ -7,15 +7,6 @@
|
||||||
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
|
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"TTS_PATH = \"/home/erogol/projects/\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
@ -26,7 +17,6 @@
|
||||||
"%autoreload 2\n",
|
"%autoreload 2\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"import sys\n",
|
"import sys\n",
|
||||||
"sys.path.append(TTS_PATH)\n",
|
|
||||||
"import torch\n",
|
"import torch\n",
|
||||||
"import importlib\n",
|
"import importlib\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
|
@ -42,7 +32,7 @@
|
||||||
"%matplotlib inline\n",
|
"%matplotlib inline\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"os.environ['CUDA_VISIBLE_DEVICES']='2'"
|
"os.environ['CUDA_VISIBLE_DEVICES']='0'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -69,12 +59,12 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"OUT_PATH = \"/data/rw/pit/data/turkish-vocoder/\"\n",
|
"OUT_PATH = \"/home/erogol/Data/LJSpeech-1.1/ljspeech-March-17-2020_01+16AM-871588c/\"\n",
|
||||||
"DATA_PATH = \"/data/rw/home/Turkish\"\n",
|
"DATA_PATH = \"/home/erogol/Data/LJSpeech-1.1/\"\n",
|
||||||
"DATASET = \"ljspeech\"\n",
|
"DATASET = \"ljspeech\"\n",
|
||||||
"METADATA_FILE = \"metadata.txt\"\n",
|
"METADATA_FILE = \"metadata.csv\"\n",
|
||||||
"CONFIG_PATH = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/config.json\"\n",
|
"CONFIG_PATH = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/config.json\"\n",
|
||||||
"MODEL_FILE = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/checkpoint_255000.pth.tar\"\n",
|
"MODEL_FILE = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/checkpoint_420000.pth.tar\"\n",
|
||||||
"BATCH_SIZE = 32\n",
|
"BATCH_SIZE = 32\n",
|
||||||
"\n",
|
"\n",
|
||||||
"QUANTIZED_WAV = False\n",
|
"QUANTIZED_WAV = False\n",
|
||||||
|
@ -85,6 +75,7 @@
|
||||||
"print(\" > CUDA enabled: \", use_cuda)\n",
|
"print(\" > CUDA enabled: \", use_cuda)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"C = load_config(CONFIG_PATH)\n",
|
"C = load_config(CONFIG_PATH)\n",
|
||||||
|
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
|
||||||
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -94,7 +85,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# if the vocabulary was passed, replace the default\n",
|
"# if the vocabulary was passed, replace the default\n",
|
||||||
"if 'characters' in C.keys():\n",
|
"if 'characters' in C.keys():\n",
|
||||||
" symbols, phonemes = make_symbols(**C.characters)\n",
|
" symbols, phonemes = make_symbols(**C.characters)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -120,7 +111,7 @@
|
||||||
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
|
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
|
||||||
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
||||||
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
|
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
|
||||||
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
||||||
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -143,7 +134,7 @@
|
||||||
"metadata = []\n",
|
"metadata = []\n",
|
||||||
"losses = []\n",
|
"losses = []\n",
|
||||||
"postnet_losses = []\n",
|
"postnet_losses = []\n",
|
||||||
"criterion = L1LossMasked()\n",
|
"criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n",
|
||||||
"with torch.no_grad():\n",
|
"with torch.no_grad():\n",
|
||||||
" for data in tqdm(loader):\n",
|
" for data in tqdm(loader):\n",
|
||||||
" # setup input data\n",
|
" # setup input data\n",
|
||||||
|
@ -232,7 +223,31 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Check model performance"
|
"### Sanity Check"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"idx = 1\n",
|
||||||
|
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import soundfile as sf\n",
|
||||||
|
"wav, sr = sf.read(item_idx[idx])\n",
|
||||||
|
"mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n",
|
||||||
|
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
|
||||||
|
"mel_truth = ap.melspectrogram(wav)\n",
|
||||||
|
"print(mel_truth.shape)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -242,10 +257,8 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot posnet output\n",
|
"# plot posnet output\n",
|
||||||
"idx = 1\n",
|
"plot_spectrogram(mel_postnet, ap);\n",
|
||||||
"mel_example = postnet_outputs[idx]\n",
|
"print(mel_postnet[:mel_lengths[idx], :].shape)"
|
||||||
"plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
|
|
||||||
"print(mel_example[:mel_lengths[1], :].shape)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -255,9 +268,8 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot decoder output\n",
|
"# plot decoder output\n",
|
||||||
"mel_example = mel_outputs[idx].data.cpu().numpy()\n",
|
"plot_spectrogram(mel_decoder, ap);\n",
|
||||||
"plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
|
"print(mel_decoder.shape)"
|
||||||
"print(mel_example[:mel_lengths[1], :].shape)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -267,10 +279,8 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot GT specgrogram\n",
|
"# plot GT specgrogram\n",
|
||||||
"wav = ap.load_wav(item_idx[idx])\n",
|
"print(mel_truth.shape)\n",
|
||||||
"melt = ap.melspectrogram(wav)\n",
|
"plot_spectrogram(mel_truth.T, ap);"
|
||||||
"print(melt.shape)\n",
|
|
||||||
"plot_spectrogram(melt.T, ap);"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -281,9 +291,9 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# postnet, decoder diff\n",
|
"# postnet, decoder diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
"mel_diff = mel_outputs[idx] - postnet_outputs[idx]\n",
|
"mel_diff = mel_decoder - mel_postnet\n",
|
||||||
"plt.figure(figsize=(16, 10))\n",
|
"plt.figure(figsize=(16, 10))\n",
|
||||||
"plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
]
|
]
|
||||||
|
@ -294,10 +304,25 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
"# mel = mel_poutputs[idx].detach().cpu().numpy()\n",
|
"mel_diff2 = mel_truth.T - mel_decoder\n",
|
||||||
"mel = postnet_outputs[idx].detach().cpu().numpy()\n",
|
"plt.figure(figsize=(16, 10))\n",
|
||||||
"mel_diff2 = melt.T - mel[:melt.shape[1]]\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
|
"plt.colorbar()\n",
|
||||||
|
"plt.tight_layout()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
|
"from matplotlib import pylab as plt\n",
|
||||||
|
"mel = postnet_outputs[idx]\n",
|
||||||
|
"mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n",
|
||||||
"plt.figure(figsize=(16, 10))\n",
|
"plt.figure(figsize=(16, 10))\n",
|
||||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
|
|
|
@ -269,12 +269,11 @@ class AudioProcessor(object):
|
||||||
y = self._istft(S_complex * angles)
|
y = self._istft(S_complex * angles)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
def compute_stft_paddings(x, fsize, fshift, pad_sides=1):
|
def compute_stft_paddings(x, pad_sides=1):
|
||||||
'''compute right padding (final frame) or both sides padding (first and final frames)
|
'''compute right padding (final frame) or both sides padding (first and final frames)
|
||||||
'''
|
'''
|
||||||
assert pad_sides in (1, 2)
|
assert pad_sides in (1, 2)
|
||||||
# return int(fsize // 2)
|
pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
|
||||||
pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
|
|
||||||
if pad_sides == 1:
|
if pad_sides == 1:
|
||||||
return 0, pad
|
return 0, pad
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue