diff --git a/notebooks/CheckSpectrograms.ipynb b/notebooks/CheckSpectrograms.ipynb deleted file mode 100644 index 7829d920..00000000 --- a/notebooks/CheckSpectrograms.ipynb +++ /dev/null @@ -1,262 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cd /home/erogol/projects/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "from mozilla_voice_tts.tts.utils.audio import AudioProcessor\n", - "from mozilla_voice_tts.tts.utils.visual import plot_spectrogram\n", - "from mozilla_voice_tts.tts.utils.generic_utils import load_config\n", - "import glob \n", - "import IPython.display as ipd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = \"/media/erogol/data_ssd/Data/models/tr/TTS-phoneme-January-14-2019_06+52PM-4ad64a7/config.json\"\n", - "data_path = \"/home/erogol/Data/Mozilla/\"\n", - "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup Audio Processor\n", - "Play with the AP parameters until you find a good fit with the synthesis speech below. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "audio={\n", - " 'audio_processor': 'audio',\n", - " 'num_mels': 80, # In general, you don'tneed to change it \n", - " 'num_freq': 1025, # In general, you don'tneed to change it \n", - " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", - " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", - " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", - " 'min_level_db': -100,\n", - " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", - " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - " 'symmetric_norm': False, # Same as above\n", - " 'max_norm': 1, # Same as above\n", - " 'clip_norm': True, # Same as above\n", - " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", - "\n", - "AP = AudioProcessor(**audio);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Check audio loading " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wav = AP.load_wav(file_paths[10])\n", - "ipd.Audio(data=wav, rate=AP.sample_rate) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate Mel-Spectrogram and Re-synthesis with GL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mel = AP.melspectrogram(wav)\n", - "print(\"Max:\", mel.max())\n", - "print(\"Min:\", mel.min())\n", - "print(\"Mean:\", mel.mean())\n", - "plot_spectrogram(mel.T, AP);\n", - "\n", - "wav_gen = AP.inv_mel_spectrogram(mel)\n", - "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate Linear-Spectrogram and Re-synthesis with GL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "spec = AP.spectrogram(wav)\n", - "print(\"Max:\", spec.max())\n", - "print(\"Min:\", spec.min())\n", - "print(\"Mean:\", spec.mean())\n", - "plot_spectrogram(spec.T, AP);\n", - "\n", - "wav_gen = AP.inv_spectrogram(spec)\n", - "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compare values for a certain parameter\n", - "\n", - "Optimize your parameters by comparing different values per parameter at a time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "audio={\n", - " 'audio_processor': 'audio',\n", - " 'num_mels': 80, # In general, you don'tneed to change it \n", - " 'num_freq': 1025, # In general, you don'tneed to change it \n", - " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", - " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", - " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", - " 'min_level_db': -100,\n", - " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", - " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - " 'symmetric_norm': False, # Same as above\n", - " 'max_norm': 1, # Same as above\n", - " 'clip_norm': True, # Same as above\n", - " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", - "\n", - "AP = AudioProcessor(**audio);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from librosa import display\n", - "from matplotlib import pylab as plt\n", - "import IPython\n", - "plt.rcParams['figure.figsize'] = (20.0, 16.0)\n", - "\n", - "def compare_values(attribute, values, file):\n", - " \"\"\"\n", - " attributes (str): the names of the attribute you like to test.\n", - " values (list): list of values to compare.\n", - " file (str): file name to perform the tests.\n", - " \"\"\"\n", - " wavs = []\n", - " for idx, val in enumerate(values):\n", - " set_val_cmd = \"AP.{}={}\".format(attribute, val)\n", - " exec(set_val_cmd)\n", - " wav = AP.load_wav(file)\n", - " spec = AP.spectrogram(wav)\n", - " spec_norm = AP._denormalize(spec.T)\n", - " plt.subplot(len(values), 2, 2*idx + 1)\n", - " plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n", - " # plt.colorbar()\n", - " plt.tight_layout()\n", - " wav_gen = AP.inv_spectrogram(spec)\n", - " wavs.append(wav_gen)\n", - " plt.subplot(len(values), 2, 2*idx + 2)\n", - " display.waveplot(wav, alpha=0.5)\n", - " display.waveplot(wav_gen, alpha=0.25)\n", - " plt.title(\"{}={}\".format(attribute, val))\n", - " plt.tight_layout()\n", - " \n", - " wav = AP.load_wav(file)\n", - " print(\" > Ground-truth\")\n", - " IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))\n", - " \n", - " for idx, wav_gen in enumerate(wavs):\n", - " val = values[idx]\n", - " print(\" > {} = {}\".format(attribute, val))\n", - " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file