coqui-tts/notebooks/dataset_analysis/AnalyzeDataset.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# TTS_PATH = \"/home/erogol/projects/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import librosa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import norm\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from multiprocessing import Pool\n",
    "from matplotlib import pylab as plt\n",
    "from collections import Counter\n",
    "from TTS.config.shared_configs import BaseDatasetConfig\n",
    "from TTS.tts.datasets import load_tts_samples\n",
    "from TTS.tts.datasets.formatters import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "NUM_PROC = 8\n",
    "DATASET_CONFIG = BaseDatasetConfig(\n",
    "    formatter=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/absolute/path/to/your/dataset/\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument\n",
    "    txt_file = os.path.join(root_path, meta_file)\n",
    "    items = []\n",
    "    speaker_name = \"myspeaker\"\n",
    "    with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n",
    "        for line in ttf:\n",
    "            cols = line.split(\"|\")\n",
    "            wav_file = os.path.join(root_path, \"wavs\", cols[0] + \".wav\")            \n",
    "            text = cols[1]\n",
    "            items.append({\"text\": text, \"audio_file\": wav_file, \"speaker_name\": speaker_name, \"root_path\": root_path})\n",
    "    return items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n",
    "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n",
    "if eval_samples is not None:\n",
    "    items = train_samples + eval_samples\n",
    "else:\n",
    "    items = train_samples\n",
    "print(\" > Number of audio files: {}\".format(len(items)))\n",
    "print(items[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# check wavs if exist\n",
    "wav_files = []\n",
    "for item in items:\n",
    "    wav_file = item[\"audio_file\"].strip()\n",
    "    wav_files.append(wav_file)\n",
    "    if not os.path.exists(wav_file):\n",
    "        print(wav_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# show duplicate items\n",
    "c = Counter(wav_files)\n",
    "print([item for item, count in c.items() if count > 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "item"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "def load_item(item):\n",
    "    text = item[\"text\"].strip()\n",
    "    file_name = item[\"audio_file\"].strip()\n",
    "    audio, sr = librosa.load(file_name, sr=None)\n",
    "    audio_len = len(audio) / sr\n",
    "    text_len = len(text)\n",
    "    return file_name, text, text_len, audio, audio_len\n",
    "\n",
    "# This will take a while depending on size of dataset\n",
    "if NUM_PROC == 1:\n",
    "    data = []\n",
    "    for m in tqdm(items):\n",
    "        data += [load_item(m)]\n",
    "else:\n",
    "    with Pool(8) as p:\n",
    "        data = list(tqdm(p.imap(load_item, items), total=len(items)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# count words in the dataset\n",
    "w_count = Counter()\n",
    "for item in tqdm(data):\n",
    "    text = item[1].lower().strip()\n",
    "    for word in text.split():\n",
    "        w_count[word] += 1\n",
    "print(\" > Number of words: {}\".format(len(w_count)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "text_vs_durs = {}  # text length vs audio duration\n",
    "text_len_counter = Counter()  # number of sentences with the keyed length\n",
    "for item in tqdm(data):\n",
    "    text = item[1].lower().strip()\n",
    "    text_len = len(text)\n",
    "    text_len_counter[text_len] += 1\n",
    "    audio_len = item[-1]\n",
    "    try:\n",
    "        text_vs_durs[text_len] += [audio_len]\n",
    "    except:\n",
    "        text_vs_durs[text_len] = [audio_len]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# text_len vs avg_audio_len, median_audio_len, std_audio_len\n",
    "text_vs_avg = {}\n",
    "text_vs_median = {}\n",
    "text_vs_std = {}\n",
    "for key, durs in text_vs_durs.items():\n",
    "    text_vs_avg[key] = np.mean(durs)\n",
    "    text_vs_median[key] = np.median(durs)\n",
    "    text_vs_std[key] = np.std(durs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Avg audio length per char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "for item in data:\n",
    "    if item[-1] < 2:\n",
    "        print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "sec_per_chars = []\n",
    "for item in data:\n",
    "    text = item[1]\n",
    "    dur = item[-1]\n",
    "    sec_per_char = dur / len(text)\n",
    "    sec_per_chars.append(sec_per_char)\n",
    "# sec_per_char /= len(data)\n",
    "# print(sec_per_char)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "mean = np.mean(sec_per_chars)\n",
    "std = np.std(sec_per_chars)\n",
    "print(mean)\n",
    "print(std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "dist = norm(mean, std)\n",
    "\n",
    "# find irregular instances long or short voice durations\n",
    "for item in data:\n",
    "    text = item[1]\n",
    "    dur = item[-1]\n",
    "    sec_per_char = dur / len(text)\n",
    "    pdf =norm.pdf(sec_per_char)\n",
    "    if pdf < 0.39:\n",
    "        print(item)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Plot Dataset Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs mean audio duration\")\n",
    "plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs median audio duration\")\n",
    "plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs STD\")\n",
    "plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "plt.title(\"text length vs # instances\")\n",
    "plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "Collapsed": "false"
   },
   "source": [
    "### Check words frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "w_count_df = pd.DataFrame.from_dict(w_count, orient='index')\n",
    "w_count_df.sort_values(0, ascending=False, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "w_count_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# check a certain word\n",
    "w_count_df.at['minute', 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "Collapsed": "false"
   },
   "outputs": [],
   "source": [
    "# fequency bar plot - it takes time!!\n",
    "w_count_df.plot.bar()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}