coqui-tts/AnalyzeDataset.ipynb at 312a539a0ef1e2a3543e8ab6853e86f8a900e5ea

68 KiB

Raw Blame History

None <html lang="en"> <head> </head>

In [1]:

import os
import glob
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
from matplotlib import pylab as plt
from collections import Counter
%matplotlib inline

In [5]:

# Directory that is globbed for "*.wav" files and joined with each file stem
# when an audio clip is loaded.
DATA_PATH = "../../../Data/LJSpeech-1.1/wavs/"
# Metadata file; it is read as UTF-8 text and each line is split on "|".
META_PATH = "../../../Data/LJSpeech-1.1/metadata.csv"
# Parallelism switch for the loading cell: 1 runs the serial loop, any other
# value takes the process-pool branch.
NUM_PROC = 8

In [6]:

# Collect every .wav file directly under DATA_PATH and report how many exist.
wav_glob = os.path.join(DATA_PATH, "*.wav")
file_names = glob.glob(wav_glob)
print(" > Number of audio files: {}".format(len(file_names)))

 > Number of audio files: 13100

In [7]:

# Read the pipe-delimited metadata into a list of field lists, one per line.
# The context manager closes the handle, which the bare open() call leaked.
with open(META_PATH, 'r', encoding='utf8') as meta_f:
    # Strip only the line terminator before splitting: otherwise the last
    # field of every row ends in "\n" and any character count taken on it is
    # one too high (and inconsistent if the final line has no newline).
    meta = [line.rstrip("\n").split("|") for line in meta_f]

In [8]:

def load_item(item):
    """Load the audio clip for one metadata row and measure text and audio.

    Args:
        item: a split metadata row; field 0 is the wav file stem (joined with
            DATA_PATH and a '.wav' suffix) and field 2 is the text.

    Returns:
        A tuple ``(text, text_len, audio, audio_len)`` where ``text_len`` is
        ``len(text)`` and ``audio_len`` is the number of loaded samples
        divided by the sample rate reported by ``librosa.load``.
    """
    stem = item[0]
    transcript = item[2]
    wav_path = os.path.join(DATA_PATH, stem + '.wav')
    # librosa.load returns (samples, sample_rate).
    samples, sample_rate = librosa.load(wav_path)
    duration = len(samples) / sample_rate
    return transcript, len(transcript), samples, duration

# This will take a while depending on size of dataset
if NUM_PROC == 1:
    # Serial path: load every row in this process.
    data = [load_item(m) for m in tqdm(meta)]
else:
    # Size the pool from NUM_PROC; it used to be hard-coded to 8, so changing
    # the setting to anything other than 1 had no effect.
    with Pool(NUM_PROC) as p:
        # imap yields results in input order, so data[i] corresponds to
        # meta[i]; total= lets tqdm size the bar for the lazy iterator.
        data = list(tqdm(p.imap(load_item, meta), total=len(meta)))

HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))

In [9]:

# Tally how often each lowercased, whitespace-separated token occurs.
w_count = Counter()
for entry in tqdm(data):
    # entry[0] is the text; Counter.update adds one per token in the iterable.
    w_count.update(entry[0].lower().split())
print(" > Number of words: {}".format(len(w_count)))

HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))

 > Number of words: 22943

In [10]:

text_vs_durs = {}  # text length -> list of audio durations seen at that length
text_len_counter = Counter()  # number of sentences with the keyed length
for item in tqdm(data):
    text = item[0].lower()
    text_len = len(text)
    text_len_counter[text_len] += 1
    audio_len = item[-1]  # last tuple field is the audio duration
    # setdefault creates the list on first sight of a length. The previous
    # bare `except:` did the same job but would also have hidden any other
    # error raised inside the try body, including KeyboardInterrupt.
    text_vs_durs.setdefault(text_len, []).append(audio_len)

HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))

In [11]:

# text_len vs avg_audio_len, median_audio_len, std_audio_len
text_vs_avg = {}
text_vs_median = {}
text_vs_std = {}
for key, durs in text_vs_durs.items():
    text_vs_avg[key] = np.mean(durs)
    text_vs_median[key] = np.median(durs)
    text_vs_std[key] = np.std(durs)

Plot Dataset Statistics¶

In [12]:

# Mean audio duration for each text length. Axis labels are set so the figure
# can be read without the surrounding code.
plt.title("text length vs mean audio duration")
plt.xlabel("text length (characters)")
plt.ylabel("mean audio duration (s)")
plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))

Out[12]:

<matplotlib.collections.PathCollection at 0x7f2428497a90>

No description has been provided for this image

In [13]:

# Median audio duration for each text length. Axis labels are set so the
# figure can be read without the surrounding code.
plt.title("text length vs median audio duration")
plt.xlabel("text length (characters)")
plt.ylabel("median audio duration (s)")
plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))

Out[13]:

<matplotlib.collections.PathCollection at 0x7f24206c7358>

In [14]:

# Standard deviation of audio duration for each text length. Axis labels are
# set so the figure can be read without the surrounding code.
plt.title("text length vs STD")
plt.xlabel("text length (characters)")
plt.ylabel("std of audio duration (s)")
plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))

Out[14]:

<matplotlib.collections.PathCollection at 0x7f242065e8d0>

In [15]:

# Number of sentences observed at each text length. Axis labels are set so
# the figure can be read without the surrounding code.
plt.title("text length vs # instances")
plt.xlabel("text length (characters)")
plt.ylabel("number of sentences")
plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))

Out[15]:

<matplotlib.collections.PathCollection at 0x7f24205f2358>

Check word frequencies¶

In [17]:

# Word counts as a one-column frame (column 0, indexed by word), ordered from
# most to least frequent.
w_count_df = (
    pd.DataFrame.from_dict(w_count, orient='index')
    .sort_values(0, ascending=False)
)

In [18]:

# Show the sorted word-count table as the cell output.
w_count_df

Out[18]:

	0
the	18299
of	8709
and	6402
to	6282
in	4778
a	4279
was	3731
that	2888
he	2711
his	2023
for	1779
on	1768
had	1628
as	1589
with	1524
by	1519
at	1463
were	1435
it	1362
which	1305
be	1135
from	1024
not	1014
this	992
is	937
or	932
but	874
one	782
have	780
oswald	776
...	...
eighteen:	1
lading	1
sustain	1
inflict,	1
markets,	1
blow.	1
ill-health	1
delirium	1
tremens,	1
dejection,	1
sacking	1
prize-fighter	1
scandalized	1
outshone	1
ferdinand	1
grain.	1
fluctuations	1
attempt"	1
action;	1
grains,	1
prices,	1
protectionists	1
depress	1
market,	1
election;	1
napoleon	1
french,	1
popularity	1
"convulsive	1
lessening	1

22943 rows × 1 columns

In [25]:

# check a certain word: read the count in column 0 for the row labelled 'minute'
w_count_df.at['minute', 0]

Out[25]:

In [ ]:

# frequency bar plot over every row of w_count_df - it takes time!!
w_count_df.plot.bar()

</html>

68 KiB Raw Blame History Unescape Escape

Plot Dataset Statistics¶

Check word frequencies¶

68 KiB

Raw Blame History