coqui-tts/dataset_analysis/AnalyzeDataset.ipynb

68 KiB
Raw Blame History

None <html lang="en"> <head> </head>
In [1]:
import os
import glob
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
from matplotlib import pylab as plt
from collections import Counter
%matplotlib inline
In [5]:
# Relative path of the directory that holds the .wav clips
DATA_PATH = "../../../Data/LJSpeech-1.1/wavs/"
# Relative path of the metadata file; it is read line by line and split on "|"
META_PATH = "../../../Data/LJSpeech-1.1/metadata.csv"
# NUM_PROC == 1 selects the serial loading loop; any other value loads through a multiprocessing Pool
NUM_PROC = 8
In [6]:
# Collect every .wav file directly under DATA_PATH and report how many were found
wav_pattern = os.path.join(DATA_PATH, "*.wav")
file_names = glob.glob(wav_pattern)
print(" > Number of audio files: {}".format(len(file_names)))
 > Number of audio files: 13100
In [7]:
# Read the pipe-separated metadata into a list of field lists, one per non-blank line.
# - The context manager guarantees the file handle is closed after reading.
# - The line terminator is stripped so it does not end up inside the last field,
#   where it would add one character to every measured text length.
# - Blank lines are skipped; they would yield a single empty field and break
#   any later access to field 2.
with open(META_PATH, 'r', encoding='utf8') as meta_f:
    meta = [line.rstrip("\r\n").split("|") for line in meta_f if line.strip()]
In [8]:
def load_item(item):
    """Load one metadata entry together with its audio clip.

    Args:
        item: sequence of metadata fields. Field 0 is the wav file name
            without the ``.wav`` extension; field 2 is the text that is measured.

    Returns:
        Tuple ``(text, text_len, audio, audio_len)``. ``text_len`` is
        ``len(text)``; ``audio_len`` is the number of loaded samples divided
        by the sampling rate returned by ``librosa.load``.
    """
    clip_name, transcript = item[0], item[2]
    wav_path = os.path.join(DATA_PATH, clip_name + '.wav')
    samples, sample_rate = librosa.load(wav_path)
    duration = len(samples) / sample_rate
    return transcript, len(transcript), samples, duration

# This will take a while depending on size of dataset
if NUM_PROC == 1:
    # serial path: every item is loaded in this process
    data = [load_item(m) for m in tqdm(meta)]
else:
    # size the pool from NUM_PROC so the configuration value is actually honoured
    # (a hard-coded worker count would silently ignore it)
    with Pool(NUM_PROC) as p:
        # imap yields results lazily, which lets tqdm show progress; total is
        # passed explicitly because the iterator has no length
        data = list(tqdm(p.imap(load_item, meta), total=len(meta)))
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))

In [9]:
# Tally how often each lower-cased, whitespace-separated token occurs in the dataset
w_count = Counter()
for item in tqdm(data):
    text = item[0].lower()
    w_count.update(text.split())
print(" > Number of words: {}".format(len(w_count)))
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))
 > Number of words: 22943
In [10]:
text_vs_durs = {}  # text length -> list of audio durations of the sentences with that length
text_len_counter = Counter()  # text length -> number of sentences with that length
for item in tqdm(data):
    text = item[0].lower()
    text_len = len(text)
    text_len_counter[text_len] += 1
    audio_len = item[-1]  # last element of each loaded item is the audio duration
    # setdefault creates the list the first time a length is seen; this replaces a
    # bare `except:` that would also have hidden unrelated errors
    text_vs_durs.setdefault(text_len, []).append(audio_len)
HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))

In [11]:
# For every text length: mean, median and standard deviation of the audio durations
text_vs_avg = {length: np.mean(durations) for length, durations in text_vs_durs.items()}
text_vs_median = {length: np.median(durations) for length, durations in text_vs_durs.items()}
text_vs_std = {length: np.std(durations) for length, durations in text_vs_durs.items()}

Plot Dataset Statistics

In [12]:
# Mean audio duration for each text length; axis labels make the figure readable on its own
plt.title("text length vs mean audio duration")
plt.xlabel("text length (characters)")
plt.ylabel("mean audio duration (seconds)")
# trailing semicolon keeps the PathCollection repr out of the cell output
plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()));
Out[12]:
<matplotlib.collections.PathCollection at 0x7f2428497a90>
No description has been provided for this image
In [13]:
# Median audio duration for each text length; axis labels make the figure readable on its own
plt.title("text length vs median audio duration")
plt.xlabel("text length (characters)")
plt.ylabel("median audio duration (seconds)")
# trailing semicolon keeps the PathCollection repr out of the cell output
plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()));
Out[13]:
<matplotlib.collections.PathCollection at 0x7f24206c7358>
No description has been provided for this image
In [14]:
# Spread of the audio durations for each text length; axis labels make the figure readable on its own
plt.title("text length vs STD")
plt.xlabel("text length (characters)")
plt.ylabel("standard deviation of audio duration (seconds)")
# trailing semicolon keeps the PathCollection repr out of the cell output
plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()));
Out[14]:
<matplotlib.collections.PathCollection at 0x7f242065e8d0>
No description has been provided for this image
In [15]:
# Number of sentences for each text length; axis labels make the figure readable on its own
plt.title("text length vs # instances")
plt.xlabel("text length (characters)")
plt.ylabel("number of sentences")
# trailing semicolon keeps the PathCollection repr out of the cell output
plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()));
Out[15]:
<matplotlib.collections.PathCollection at 0x7f24205f2358>
No description has been provided for this image

Check word frequencies

In [17]:
# One-column frame of word counts (words as index, counts in column 0),
# ordered from the most to the least frequent word
w_count_df = (
    pd.DataFrame.from_dict(w_count, orient='index')
    .sort_values(0, ascending=False)
)
In [18]:
# show the word-count table (sorted by count, highest first)
w_count_df
Out[18]:
0
the 18299
of 8709
and 6402
to 6282
in 4778
a 4279
was 3731
that 2888
he 2711
his 2023
for 1779
on 1768
had 1628
as 1589
with 1524
by 1519
at 1463
were 1435
it 1362
which 1305
be 1135
from 1024
not 1014
this 992
is 937
or 932
but 874
one 782
have 780
oswald 776
... ...
eighteen: 1
lading 1
sustain 1
inflict, 1
markets, 1
blow. 1
ill-health 1
delirium 1
tremens, 1
dejection, 1
sacking 1
prize-fighter 1
scandalized 1
outshone 1
ferdinand 1
grain. 1
fluctuations 1
attempt" 1
action; 1
grains, 1
prices, 1
protectionists 1
depress 1
market, 1
election; 1
napoleon 1
french, 1
popularity 1
"convulsive 1
lessening 1

22943 rows × 1 columns

In [25]:
# check a certain word: look up its count by index label; column 0 holds the counts
w_count_df.at['minute', 0]
Out[25]:
11
In [ ]:
# frequency bar plot - it takes time!!
# (the whole of w_count_df is plotted, not just the most frequent words)
w_count_df.plot.bar()
</html>