coqui-tts/notebooks/dataset_analysis/AnalyzeDataset.ipynb

In [ ]:
TTS_PATH = "/home/erogol/projects/"
In [ ]:
import os
import sys
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
import glob
import librosa
import numpy as np
import pandas as pd
from scipy.stats import norm
from tqdm.notebook import tqdm
from multiprocessing import Pool
from matplotlib import pylab as plt
from collections import Counter
from TTS.tts.datasets.preprocess import *
%matplotlib inline
In [ ]:
DATA_PATH = "/home/erogol/Data/m-ai-labs/de_DE/by_book/male/karlsson/"
META_DATA = ["kleinzaches/metadata.csv",
            "spiegel_kaetzchen/metadata.csv",
            "herrnarnesschatz/metadata.csv",
            "maedchen_von_moorhof/metadata.csv",
            "koenigsgaukler/metadata.csv",
            "altehous/metadata.csv",
            "odysseus/metadata.csv",
            "undine/metadata.csv",
            "reise_tilsit/metadata.csv",
            "schmied_seines_glueckes/metadata.csv",
            "kammmacher/metadata.csv",
            "unterm_birnbaum/metadata.csv",
            "liebesbriefe/metadata.csv",
            "sandmann/metadata.csv"]
NUM_PROC = 8
In [ ]:
# use your own preprocessor at this stage - TTS/tts/datasets/preprocess.py
items = mailabs(DATA_PATH, META_DATA)
print(" > Number of audio files: {}".format(len(items)))
In [ ]:
# check that all wav files exist
wav_files = []
for item in items:
    wav_file = item[1].strip()
    wav_files.append(wav_file)
    if not os.path.exists(wav_file):
        print(wav_file)
In [ ]:
# show duplicate items
c = Counter(wav_files)
print([item for item, count in c.items() if count > 1])
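
If this prints anything, you may want to keep only the first occurrence of each wav path before continuing. A minimal sketch (`unique_items` is a name introduced here):

In [ ]:
# drop duplicate wav paths, keeping the first occurrence of each
seen = set()
unique_items = []
for item in items:
    wav_file = item[1].strip()
    if wav_file not in seen:
        seen.add(wav_file)
        unique_items.append(item)
items = unique_items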
In [ ]:
def load_item(item):
    file_name = item[1].strip()
    text = item[0].strip()
    audio, sr = librosa.load(file_name, sr=None)  # keep the native sample rate
    audio_len = len(audio) / sr  # duration in seconds
    text_len = len(text)
    return file_name, text, text_len, audio, audio_len

# This will take a while depending on the size of the dataset
if NUM_PROC == 1:
    data = []
    for m in tqdm(items):
        data += [load_item(m)]
else:
    with Pool(NUM_PROC) as p:
        data = list(tqdm(p.imap(load_item, items), total=len(items)))
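
With the audio loaded, a quick sanity check is the total dataset duration; single-speaker TTS models often need tens of hours of speech, so it is worth confirming before training:

In [ ]:
# total dataset duration in hours (audio_len is the last tuple element, in seconds)
total_secs = sum(item[-1] for item in data)
print(" > Total duration: {:.2f} hours".format(total_secs / 3600))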
In [ ]:
# count words in the dataset
w_count = Counter()
for item in tqdm(data):
    text = item[1].lower().strip()
    for word in text.split():
        w_count[word] += 1
print(" > Number of words: {}".format(len(w_count)))
In [ ]:
text_vs_durs = {}  # text length vs audio duration
text_len_counter = Counter()  # number of sentences with the keyed length
for item in tqdm(data):
    text = item[1].lower().strip()
    text_len = len(text)
    text_len_counter[text_len] += 1
    audio_len = item[-1]
    try:
        text_vs_durs[text_len] += [audio_len]
    except KeyError:
        text_vs_durs[text_len] = [audio_len]
In [ ]:
# text_len vs avg_audio_len, median_audio_len, std_audio_len
text_vs_avg = {}
text_vs_median = {}
text_vs_std = {}
for key, durs in text_vs_durs.items():
    text_vs_avg[key] = np.mean(durs)
    text_vs_median[key] = np.median(durs)
    text_vs_std[key] = np.std(durs)

Average audio length per character

In [ ]:
# list clips shorter than 2 seconds - usually too short to be useful for training
for item in data:
    if item[-1] < 2:
        print(item)
In [ ]:
sec_per_chars = []
for item in data:
    text = item[1]
    dur = item[-1]
    sec_per_char = dur / len(text)
    sec_per_chars.append(sec_per_char)
In [ ]:
mean = np.mean(sec_per_chars)
std = np.std(sec_per_chars)
print(mean)
print(std)
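
Before flagging outliers, it is worth checking how well a normal distribution actually fits the seconds-per-character values. A minimal sketch (the bin count of 50 is arbitrary):

In [ ]:
# histogram of seconds-per-character with the fitted normal density overlaid
plt.hist(sec_per_chars, bins=50, density=True)
xs = np.linspace(min(sec_per_chars), max(sec_per_chars), 200)
plt.plot(xs, norm.pdf(xs, mean, std))
plt.title("sec_per_char distribution")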
In [ ]:
dist = norm(mean, std)

# find irregular instances with unusually long or short durations
for item in data:
    text = item[1]
    dur = item[-1]
    sec_per_char = dur / len(text)
    pdf = dist.pdf(sec_per_char)  # likelihood under the fitted normal
    if pdf < 0.39:  # empirical threshold - tune it for your dataset
        print(item)
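
An equivalent and arguably more readable check is a z-score cutoff: flag items whose seconds-per-character value deviates from the mean by more than a chosen number of standard deviations. The 3-sigma threshold below is only an example value:

In [ ]:
# same idea expressed as a z-score cutoff (3 sigma is an example threshold)
for item in data:
    sec_per_char = item[-1] / len(item[1])
    if abs(sec_per_char - mean) > 3 * std:
        print(item)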

Plot Dataset Statistics

In [ ]:
plt.title("text length vs mean audio duration")
plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))
In [ ]:
plt.title("text length vs median audio duration")
plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))
In [ ]:
plt.title("text length vs STD")
plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))
In [ ]:
plt.title("text length vs # instances")
plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))

Check word frequencies

In [ ]:
w_count_df = pd.DataFrame.from_dict(w_count, orient='index')
w_count_df.sort_values(0, ascending=False, inplace=True)
In [ ]:
w_count_df
In [ ]:
# check a certain word
w_count_df.at['minute', 0]
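
The bottom of the sorted table is just as informative: words that occur only once or twice often point to transcription typos.

In [ ]:
# least frequent words - singletons are worth inspecting for typos
w_count_df.tail(20)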
In [ ]:
# frequency bar plot - this can take a while!
w_count_df.plot.bar()
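
If the full bar plot is too slow, plotting only the most frequent rows is usually enough to see the shape of the distribution (50 is an arbitrary cutoff):

In [ ]:
# bar plot of the 50 most frequent words only - much faster
w_count_df[:50].plot.bar()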