mirror of https://github.com/coqui-ai/TTS.git

Update CheckSpectrograms notebook (#1418)

parent c7f9ec07c8
commit 2e6e8f651d
@@ -3,6 +3,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "%matplotlib inline\n",
     "\n",
@@ -12,21 +16,51 @@
     "\n",
     "import IPython.display as ipd\n",
     "import glob"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
-    "config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
-    "data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
-    "\n",
-    "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
-    "CONFIG = load_config(config_path)\n",
+    "from TTS.config.shared_configs import BaseAudioConfig\n",
+    "CONFIG = BaseAudioConfig()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ✍️ Set these values "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_path = \"/root/wav48_silence_trimmed/\"\n",
+    "file_ext = \".flac\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read audio files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
     "\n",
     "# Change this to the index of the desired file listed below\n",
     "sample_file_index = 10\n",
@@ -35,44 +69,45 @@
     "\n",
     "print(\"File list, by index:\")\n",
     "dict(enumerate(file_paths))"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "markdown",
+   "metadata": {
+    "Collapsed": "false"
+   },
    "source": [
-    "### Setup Audio Processor\n",
+    "## ✍️ Set Audio Processor\n",
     "Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
     "\n",
     "The default values are loaded from your config.json file, so you only need to\n",
     "uncomment and modify values below that you'd like to tune."
-   ],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "tune_params={\n",
-    "# 'audio_processor': 'audio',\n",
-    "# 'num_mels': 80, # In general, you don't need to change this. \n",
-    "# 'fft_size': 1024, # In general, you don't need to change this.\n",
-    "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
-    "# 'hop_length': 256, # In general, you don't need to change this.\n",
-    "# 'win_length': 1024, # In general, you don't need to change this.\n",
-    "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
-    "# 'min_level_db': -100,\n",
-    "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
-    "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
-    "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
-    "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
-    "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
-    "# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
+    " 'num_mels': 80, # In general, you don't need to change this. \n",
+    " 'fft_size': 2400, # In general, you don't need to change this.\n",
+    " 'frame_length_ms': 50, \n",
+    " 'frame_shift_ms': 12.5,\n",
+    " 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
+    " 'hop_length': None, # In general, you don't need to change this.\n",
+    " 'win_length': 1024, # In general, you don't need to change this.\n",
+    " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
+    " 'min_level_db': -100,\n",
+    " 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
+    " 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
+    " 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
+    " 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
+    " 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
+    " 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
     "}\n",
     "\n",
     "# These options have to be forced off in order to avoid errors about the \n",
@@ -86,59 +121,57 @@
     "}\n",
     "\n",
     "# Override select parts of loaded config with parameters above\n",
-    "tuned_config = CONFIG.audio.copy()\n",
+    "tuned_config = CONFIG.copy()\n",
     "tuned_config.update(reset)\n",
     "tuned_config.update(tune_params)\n",
     "\n",
     "AP = AudioProcessor(**tuned_config);"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "markdown",
-   "source": [
-    "### Check audio loading "
-   ],
    "metadata": {
     "Collapsed": "false"
-   }
+   },
+   "source": [
+    "### Check audio loading "
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
     "ipd.Audio(data=wav, rate=AP.sample_rate) "
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "markdown",
-   "source": [
-    "### Generate Mel-Spectrogram and Re-synthesis with GL"
-   ],
    "metadata": {
     "Collapsed": "false"
-   }
+   },
+   "source": [
+    "### Generate Mel-Spectrogram and Re-synthesis with GL"
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "AP.power = 1.5"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "mel = AP.melspectrogram(wav)\n",
     "print(\"Max:\", mel.max())\n",
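Taken together, the hunks above switch the notebook from loading a training config.json off disk to building a `BaseAudioConfig` in code, overriding only the fields being tuned, and handing the result to `AudioProcessor`. A condensed sketch of that flow follows; the `TTS.utils.audio` import path for `AudioProcessor` is assumed rather than shown in this diff, the parameter values mirror the new `tune_params` cell, and the notebook's small `reset` dict is omitted because its keys sit outside the shown context.

```python
# Condensed sketch of the notebook's new setup flow (not the cells verbatim).
# Assumption: AudioProcessor is imported from TTS.utils.audio; values mirror tune_params above.
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

# Start from library defaults instead of a config.json loaded from disk.
CONFIG = BaseAudioConfig()

# Override only the fields being tuned; the fully commented list lives in the notebook cell.
tune_params = {
    "num_mels": 80,
    "fft_size": 2400,
    "frame_length_ms": 50,
    "frame_shift_ms": 12.5,
    "sample_rate": 48000,   # must match the sample rate of the dataset
    "hop_length": None,     # derived from frame_shift_ms when left as None
    "win_length": 1024,
    "mel_fmax": 8000.0,
}

tuned_config = CONFIG.copy()        # copy the defaults, as the notebook does
tuned_config.update(tune_params)    # (the notebook also applies a small `reset` dict here)

AP = AudioProcessor(**tuned_config)  # the audio processor used by the rest of the notebook
```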
@@ -148,24 +181,24 @@
     "\n",
     "wav_gen = AP.inv_melspectrogram(mel)\n",
     "ipd.Audio(wav_gen, rate=AP.sample_rate)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "markdown",
-   "source": [
-    "### Generate Linear-Spectrogram and Re-synthesis with GL"
-   ],
    "metadata": {
     "Collapsed": "false"
-   }
+   },
+   "source": [
+    "### Generate Linear-Spectrogram and Re-synthesis with GL"
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "spec = AP.spectrogram(wav)\n",
     "print(\"Max:\", spec.max())\n",
@@ -175,26 +208,26 @@
     "\n",
     "wav_gen = AP.inv_spectrogram(spec)\n",
     "ipd.Audio(wav_gen, rate=AP.sample_rate)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "markdown",
+   "metadata": {
+    "Collapsed": "false"
+   },
    "source": [
     "### Compare values for a certain parameter\n",
     "\n",
     "Optimize your parameters by comparing different values per parameter at a time."
-   ],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "from librosa import display\n",
     "from matplotlib import pylab as plt\n",
@@ -234,39 +267,39 @@
     " val = values[idx]\n",
     " print(\" > {} = {}\".format(attribute, val))\n",
     " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
    "source": [
     "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
-   ],
-   "outputs": [],
-   "metadata": {
-    "Collapsed": "false"
-   }
+   ]
  },
  {
    "cell_type": "code",
    "execution_count": null,
-   "source": [
-    "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
-   ],
-   "outputs": [],
    "metadata": {
     "Collapsed": "false"
-   }
+   },
+   "outputs": [],
+   "source": [
+    "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
+   ]
  }
 ],
 "metadata": {
+  "interpreter": {
+   "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
+  },
  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3.8.5 64-bit ('torch': conda)"
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
@@ -278,12 +311,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.5"
- },
- "interpreter": {
-  "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
+  "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
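The closing cells sweep one `AudioProcessor` attribute at a time via a `compare_values` helper whose body lies in notebook context this diff does not show. Below is a rough, hypothetical reconstruction of what such a sweep does, reusing the `AP` processor and the `wav` sample loaded in earlier cells; the real helper may differ in detail (for example, it may synthesize from the linear spectrogram rather than the mel spectrogram).

```python
# Hypothetical reconstruction of the compare_values helper called in the last two cells.
# `AP` and `wav` come from earlier notebook cells (AudioProcessor setup and AP.load_wav).
import IPython.display as ipd


def compare_values(attribute, values):
    """Try each value for one AudioProcessor attribute and play the Griffin-Lim result."""
    original = getattr(AP, attribute)
    for val in values:
        setattr(AP, attribute, val)               # e.g. AP.preemphasis = 0.98
        mel = AP.melspectrogram(wav)              # re-extract features with the new setting
        wav_gen = AP.inv_melspectrogram(mel)      # Griffin-Lim re-synthesis
        print(" > {} = {}".format(attribute, val))
        ipd.display(ipd.Audio(wav_gen, rate=AP.sample_rate))
    setattr(AP, attribute, original)              # restore the original setting


# As called in the diff:
# compare_values("preemphasis", [0, 0.5, 0.97, 0.98, 0.99])
# compare_values("ref_level_db", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])
```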