diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb
new file mode 100644
index 00000000..af00deaf
--- /dev/null
+++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "source": [
+    "# Jupyter Notebook for phoneme coverage analysis\n",
+    "\n",
+    "This notebook checks the dataset configured in config.json for phoneme coverage.\n",
+    "As mentioned in https://github.com/mozilla/TTS/wiki/Dataset#what-makes-a-good-dataset, good phoneme coverage is recommended.\n",
+    "\n",
+    "Most parameters are taken from the config.json file in the Mozilla TTS repo, so please make sure it is configured correctly for your dataset.\n",
+    "This notebook reuses lots of existing code from the TTS repo to ensure future compatibility.\n",
+    "\n",
+    "Many thanks to Neil Stoker for supporting me on this topic :-).\n",
+    "\n",
+    "I provide this notebook without any warranty, but it is hopefully useful for your dataset analysis.\n",
+    "\n",
+    "Happy TTS'ing :-)\n",
+    "\n",
+    "Thorsten Müller\n",
+    "\n",
+    "* https://github.com/thorstenMueller/deep-learning-german-tts\n",
+    "* https://discourse.mozilla.org/t/contributing-my-german-voice-for-tts/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
+   "source": [
+    "# Set some variables\n",
+    "# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n",
+    "CONFIG_FILE = \"/path/to/config/config.json\"\n",
+    "CHARS_TO_REMOVE = \".,:!?'\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "Collapsed": "false"
+   },
+   "outputs": [],
+   "source": [
+    "# Imports from the TTS repo and third-party packages\n",
+    "from TTS.utils.io import load_config\n",
+    "from TTS.tts.datasets.preprocess import load_meta_data\n",
+    "from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n",
+    "from tqdm import tqdm\n",
+    "import matplotlib.pyplot as plt\n",
+    "from multiprocessing import Pool, cpu_count\n",
+    "\n",
+    "# Standard library imports used for counting and sorting\n",
+    "import collections\n",
+    "import operator\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "Collapsed": "false",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Load config.json properties\n",
+    "CONFIG = load_config(CONFIG_FILE)\n",
+    "\n",
+    "# Load the properties needed for the analysis from config.json\n",
+    "CONFIG_METADATA = sorted(load_meta_data(CONFIG.datasets)[0])\n",
+    "CONFIG_DATASET = CONFIG.datasets[0]\n",
+    "CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
+    "CONFIG_TEXT_CLEANER = CONFIG.text_cleaner\n",
+    "CONFIG_ENABLE_EOS_BOS_CHARS = CONFIG.enable_eos_bos_chars\n",
+    "\n",
+    "# Will be printed on the generated output graph\n",
+    "CONFIG_RUN_NAME = CONFIG.run_name\n",
+    "CONFIG_RUN_DESC = CONFIG.run_description"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "Collapsed": "false",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Print some debug information on the loaded config values\n",
+    "print(\" > Run name: \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\")\n",
+    "print(\" > Dataset files: \" + str(len(CONFIG_METADATA)))\n",
+    "print(\" > Phoneme language: \" + CONFIG_PHONEME_LANGUAGE)\n",
+    "print(\" > Used text cleaner: \" + CONFIG_TEXT_CLEANER)\n",
+    "print(\" > Enable eos bos chars: \" + str(CONFIG_ENABLE_EOS_BOS_CHARS))"
+   ]
+  },
+  {
+   "cell_type": "code",
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_phoneme_from_sequence(text):\n", + " temp_list = []\n", + " if len(text[0]) > 0:\n", + " temp_text = text[0].rstrip('\\n')\n", + " for rm_bad_chars in CHARS_TO_REMOVE:\n", + " temp_text = temp_text.replace(rm_bad_chars,\"\")\n", + " seq = phoneme_to_sequence(temp_text, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n", + " text = sequence_to_phoneme(seq)\n", + " text = text.replace(\" \",\"\")\n", + " temp_list.append(text)\n", + " return temp_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "# Get phonemes from metadata\n", + "phonemes = []\n", + "\n", + "with Pool(cpu_count()-1) as p:\n", + " \n", + " phonemes = list(tqdm(p.imap(get_phoneme_from_sequence, CONFIG_METADATA), total=len(CONFIG_METADATA)))\n", + " phonemes = [i for sub in phonemes for i in sub]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "s = \"\"\n", + "phonemeString = s.join(phonemes)\n", + "\n", + "d = {}\n", + "collections._count_elements(d, phonemeString)\n", + "sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))\n", + "\n", + "# remove useless keys\n", + "sorted_d.pop(' ', None)\n", + "sorted_d.pop('ˈ', None)\n", + "\n", + "phonemesSum = len(phonemeString.replace(\" \",\"\"))\n", + "\n", + "print(\"Dataset contains \" + str(len(sorted_d)) + \" different ipa phonemes.\")\n", + "print(\"Dataset consists of \" + str(phonemesSum) + \" phonemes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"5 rarest phonemes\")\n", + "\n", + "rareList = dict(sorted(sorted_d.items(), key=operator.itemgetter(1), reverse=False)[:5])\n", + "for key, value in rareList.items():\n", + " print(key + \" --> \" + str(value) + \" occurrences\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# create plot from analysis result\n", + "\n", + "x = []\n", + "y = []\n", + "\n", + "for key, value in sorted_d.items():\n", + " x.append(key)\n", + " y.append(value)\n", + "\n", + "plt.figure(figsize=(50,50))\n", + "plt.title(\"Phoneme coverage for \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\", fontsize=50)\n", + "plt.xticks(fontsize=50)\n", + "plt.yticks(fontsize=50)\n", + "plt.barh(x,y, align='center', alpha=1.0)\n", + "plt.gca().invert_yaxis()\n", + "plt.ylabel('phoneme', fontsize=50)\n", + "plt.xlabel('occurrences', fontsize=50)\n", + "\n", + "for i, v in enumerate(y):\n", + " plt.text(v + 2, i - .2, str(v), fontsize=20)\n", + " plt.text(v + 2, i + .2, \"(\" + str(round(100/phonemesSum * v,2)) + \"%)\", fontsize=20)\n", + " \n", + " \n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9-final" + } + }, + 
"nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file