diff --git a/dataset_analysis/AnalyzeDataset.ipynb b/dataset_analysis/AnalyzeDataset.ipynb index 1b04a746..3ed54ded 100644 --- a/dataset_analysis/AnalyzeDataset.ipynb +++ b/dataset_analysis/AnalyzeDataset.ipynb @@ -2,142 +2,132 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TTS_PATH = \"/home/erogol/projects/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", + "import sys\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", "import glob\n", "import librosa\n", "import numpy as np\n", "import pandas as pd\n", + "from scipy.stats import norm\n", "from tqdm import tqdm_notebook as tqdm\n", "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", + "from TTS.datasets.preprocess import *\n", "%matplotlib inline" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "DATA_PATH = \"../../../Data/LJSpeech-1.1/wavs/\"\n", - "META_PATH = \"../../../Data/LJSpeech-1.1/metadata.csv\"\n", + "DATA_PATH = \"/home/erogol/Data/m-ai-labs/de_DE/by_book/male/karlsson/\"\n", + "META_DATA = [\"kleinzaches/metadata.csv\",\n", + " \"spiegel_kaetzchen/metadata.csv\",\n", + " \"herrnarnesschatz/metadata.csv\",\n", + " \"maedchen_von_moorhof/metadata.csv\",\n", + " \"koenigsgaukler/metadata.csv\",\n", + " \"altehous/metadata.csv\",\n", + " \"odysseus/metadata.csv\",\n", + " \"undine/metadata.csv\",\n", + " \"reise_tilsit/metadata.csv\",\n", + " \"schmied_seines_glueckes/metadata.csv\",\n", + " \"kammmacher/metadata.csv\",\n", + " \"unterm_birnbaum/metadata.csv\",\n", + " \"liebesbriefe/metadata.csv\",\n", + " \"sandmann/metadata.csv\"]\n", "NUM_PROC = 8" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", -
"output_type": "stream", - "text": [ - " > Number of audio files: 13100\n" - ] - } - ], - "source": [ - "file_names = glob.glob(os.path.join(DATA_PATH, \"*.wav\"))\n", - "print(\" > Number of audio files: {}\".format(len(file_names)))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "meta_f = open(META_PATH, 'r', encoding='utf8')\n", - "meta = [m.split(\"|\") for m in meta_f.readlines()]" + "# use your own preprocessor at this stage - TTS/datasets/preprocess.py\n", + "items = mailabs(DATA_PATH, META_DATA)\n", + "print(\" > Number of audio files: {}\".format(len(items)))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f899c42f6f514ab9bf3834e5facef6a3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], + "source": [ + "# collect every wav path and report the ones missing on disk\n", + "wav_files = []\n", + "for item in items:\n", + " wav_file = item[1].strip()\n", + " wav_files.append(wav_file)\n", + " if not os.path.exists(wav_file):\n", + " print(wav_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show duplicate items\n", + "c = Counter(wav_files)\n", + "print([item for item, count in c.items() if count > 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def load_item(item):\n", - " file_name = item[0]\n", - " text = item[2]\n", - " audio = librosa.load(os.path.join(DATA_PATH, file_name+'.wav'))\n", + " file_name = item[1].strip()\n", + " text = item[0].strip()\n", + " audio = 
librosa.load(file_name, sr=None)\n", " sr = audio[1]\n", " audio = audio[0]\n", " audio_len = len(audio) / sr\n", " text_len = len(text)\n", - " return text, text_len, audio, audio_len\n", + " return file_name, text, text_len, audio, audio_len\n", "\n", "# This will take a while depending on size of dataset\n", "if NUM_PROC == 1:\n", " data = []\n", - " for m in tqdm(meta):\n", + " for m in tqdm(items):\n", " data += [load_item(m)]\n", "else:\n", - " with Pool(8) as p:\n", + " with Pool(NUM_PROC) as p:\n", - " data = list(tqdm(p.imap(load_item, meta), total=len(meta)))" + " data = list(tqdm(p.imap(load_item, items), total=len(items)))" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e42aca59abe14f8bb32b5d5f19af1c67", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " > Number of words: 22943\n" - ] - } - ], + "outputs": [], "source": [ "# count words in the dataset\n", "w_count = Counter()\n", "for item in tqdm(data):\n", - " text = item[0].lower()\n", + " text = item[1].lower().strip()\n", " for word in text.split():\n", " w_count[word] += 1\n", "print(\" > Number of words: {}\".format(len(w_count)))" @@ -145,36 +135,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "647a2e1810324971aacb971acff91fb3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [],
"source": [ "text_vs_durs = {} # text length vs audio duration\n", "text_len_counter = Counter() # number of sentences with the keyed length\n", "for item in tqdm(data):\n", - " text = item[0].lower()\n", + " text = item[1].lower().strip()\n", " text_len = len(text)\n", " text_len_counter[text_len] += 1\n", " audio_len = item[-1]\n", @@ -186,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -200,6 +168,70 @@ " text_vs_std[key] = np.std(durs)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Avg audio length per char" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for item in data:\n", + " if item[-1] < 2:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sec_per_chars = []\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " sec_per_chars.append(sec_per_char)\n", + "# sec_per_char /= len(data)\n", + "# print(sec_per_char)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean = np.mean(sec_per_chars)\n", + "std = np.std(sec_per_chars)\n", + "print(mean)\n", + "print(std)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dist = norm(mean, std)\n", + "\n", + "# flag instances whose sec-per-char is unlikely under the fitted normal (voice too long or too short)\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " pdf = dist.pdf(sec_per_char) # density under the fitted N(mean, std), not the standard normal\n", + " if pdf < 0.39: # NOTE(review): absolute density cut-off; re-tune for the fitted std\n", + " print(item)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -209,30 +241,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [
- "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plt.title(\"text length vs mean audio duration\")\n", "plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))" @@ -240,30 +251,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plt.title(\"text length vs median audio duration\")\n", "plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))" @@ -271,30 +261,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plt.title(\"text length vs STD\")\n", "plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))" @@ -302,30 +271,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plt.title(\"text length vs # instances\")\n", "plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))" @@ -340,7 
+288,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -350,377 +298,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0
the18299
of8709
and6402
to6282
in4778
a4279
was3731
that2888
he2711
his2023
for1779
on1768
had1628
as1589
with1524
by1519
at1463
were1435
it1362
which1305
be1135
from1024
not1014
this992
is937
or932
but874
one782
have780
oswald776
......
eighteen:1
lading1
sustain1
inflict,1
markets,1
blow.1
ill-health1
delirium1
tremens,1
dejection,1
sacking1
prize-fighter1
scandalized1
outshone1
ferdinand1
grain.1
fluctuations1
attempt\"1
action;1
grains,1
prices,1
protectionists1
depress1
market,1
election;1
napoleon1
french,1
popularity1
\"convulsive1
lessening1
\n", - "

22943 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " 0\n", - "the 18299\n", - "of 8709\n", - "and 6402\n", - "to 6282\n", - "in 4778\n", - "a 4279\n", - "was 3731\n", - "that 2888\n", - "he 2711\n", - "his 2023\n", - "for 1779\n", - "on 1768\n", - "had 1628\n", - "as 1589\n", - "with 1524\n", - "by 1519\n", - "at 1463\n", - "were 1435\n", - "it 1362\n", - "which 1305\n", - "be 1135\n", - "from 1024\n", - "not 1014\n", - "this 992\n", - "is 937\n", - "or 932\n", - "but 874\n", - "one 782\n", - "have 780\n", - "oswald 776\n", - "... ...\n", - "eighteen: 1\n", - "lading 1\n", - "sustain 1\n", - "inflict, 1\n", - "markets, 1\n", - "blow. 1\n", - "ill-health 1\n", - "delirium 1\n", - "tremens, 1\n", - "dejection, 1\n", - "sacking 1\n", - "prize-fighter 1\n", - "scandalized 1\n", - "outshone 1\n", - "ferdinand 1\n", - "grain. 1\n", - "fluctuations 1\n", - "attempt\" 1\n", - "action; 1\n", - "grains, 1\n", - "prices, 1\n", - "protectionists 1\n", - "depress 1\n", - "market, 1\n", - "election; 1\n", - "napoleon 1\n", - "french, 1\n", - "popularity 1\n", - "\"convulsive 1\n", - "lessening 1\n", - "\n", - "[22943 rows x 1 columns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "w_count_df" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# check a certain word\n", "w_count_df.at['minute', 0]"