mirror of https://github.com/coqui-ai/TTS.git
68 KiB
68 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
import os import glob import librosa import numpy as np import pandas as pd from tqdm import tqdm_notebook as tqdm from multiprocessing import Pool from matplotlib import pylab as plt from collections import Counter %matplotlib inline
In [5]:
DATA_PATH = "../../../Data/LJSpeech-1.1/wavs/" META_PATH = "../../../Data/LJSpeech-1.1/metadata.csv" NUM_PROC = 8
In [6]:
file_names = glob.glob(os.path.join(DATA_PATH, "*.wav")) print(" > Number of audio files: {}".format(len(file_names)))
> Number of audio files: 13100
In [7]:
meta_f = open(META_PATH, 'r', encoding='utf8') meta = [m.split("|") for m in meta_f.readlines()]
In [8]:
def load_item(item): file_name = item[0] text = item[2] audio = librosa.load(os.path.join(DATA_PATH, file_name+'.wav')) sr = audio[1] audio = audio[0] audio_len = len(audio) / sr text_len = len(text) return text, text_len, audio, audio_len # This will take a while depending on size of dataset if NUM_PROC == 1: data = [] for m in tqdm(meta): data += [load_item(m)] else: with Pool(8) as p: data = list(tqdm(p.imap(load_item, meta), total=len(meta)))
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))
In [9]:
# count words in the dataset w_count = Counter() for item in tqdm(data): text = item[0].lower() for word in text.split(): w_count[word] += 1 print(" > Number of words: {}".format(len(w_count)))
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))
> Number of words: 22943
In [10]:
text_vs_durs = {} # text length vs audio duration text_len_counter = Counter() # number of sentences with the keyed length for item in tqdm(data): text = item[0].lower() text_len = len(text) text_len_counter[text_len] += 1 audio_len = item[-1] try: text_vs_durs[text_len] += [audio_len] except: text_vs_durs[text_len] = [audio_len]
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))
In [11]:
# text_len vs avg_audio_len, median_audio_len, std_audio_len text_vs_avg = {} text_vs_median = {} text_vs_std = {} for key, durs in text_vs_durs.items(): text_vs_avg[key] = np.mean(durs) text_vs_median[key] = np.median(durs) text_vs_std[key] = np.std(durs)
Plot Dataset Statistics¶
In [12]:
plt.title("text length vs mean audio duration") plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))
Out[12]:
<matplotlib.collections.PathCollection at 0x7f2428497a90>
In [13]:
plt.title("text length vs median audio duration") plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))
Out[13]:
<matplotlib.collections.PathCollection at 0x7f24206c7358>
In [14]:
plt.title("text length vs STD") plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))
Out[14]:
<matplotlib.collections.PathCollection at 0x7f242065e8d0>
In [15]:
plt.title("text length vs # instances") plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))
Out[15]:
<matplotlib.collections.PathCollection at 0x7f24205f2358>
Check words frequencies¶
In [17]:
w_count_df = pd.DataFrame.from_dict(w_count, orient='index') w_count_df.sort_values(0, ascending=False, inplace=True)
In [18]:
w_count_df
Out[18]:
0 | |
---|---|
the | 18299 |
of | 8709 |
and | 6402 |
to | 6282 |
in | 4778 |
a | 4279 |
was | 3731 |
that | 2888 |
he | 2711 |
his | 2023 |
for | 1779 |
on | 1768 |
had | 1628 |
as | 1589 |
with | 1524 |
by | 1519 |
at | 1463 |
were | 1435 |
it | 1362 |
which | 1305 |
be | 1135 |
from | 1024 |
not | 1014 |
this | 992 |
is | 937 |
or | 932 |
but | 874 |
one | 782 |
have | 780 |
oswald | 776 |
... | ... |
eighteen: | 1 |
lading | 1 |
sustain | 1 |
inflict, | 1 |
markets, | 1 |
blow. | 1 |
ill-health | 1 |
delirium | 1 |
tremens, | 1 |
dejection, | 1 |
sacking | 1 |
prize-fighter | 1 |
scandalized | 1 |
outshone | 1 |
ferdinand | 1 |
grain. | 1 |
fluctuations | 1 |
attempt" | 1 |
action; | 1 |
grains, | 1 |
prices, | 1 |
protectionists | 1 |
depress | 1 |
market, | 1 |
election; | 1 |
napoleon | 1 |
french, | 1 |
popularity | 1 |
"convulsive | 1 |
lessening | 1 |
22943 rows × 1 columns
In [25]:
# check a certain word w_count_df.at['minute', 0]
Out[25]:
11
In [ ]:
# fequency bar plot - it takes time!! w_count_df.plot.bar()