mirror of https://github.com/coqui-ai/TTS.git
9.2 KiB
9.2 KiB
None
<html lang="en">
<head>
</head>
</html>
In [ ]:
# Root of the local TTS checkout; appended to sys.path below so the
# notebook can import TTS without a global install.
TTS_PATH = "/home/erogol/projects/"
In [ ]:
import os import sys sys.path.append(TTS_PATH) # set this if TTS is not installed globally import glob import librosa import numpy as np import pandas as pd from scipy.stats import norm from tqdm import tqdm_notebook as tqdm from multiprocessing import Pool from matplotlib import pylab as plt from collections import Counter from TTS.tts.datasets.preprocess import * %matplotlib inline
In [ ]:
# Root of the M-AILABS German (karlsson) speech corpus on disk.
DATA_PATH = "/home/erogol/Data/m-ai-labs/de_DE/by_book/male/karlsson/"
# Per-book metadata files (relative to DATA_PATH) to load and analyze.
META_DATA = ["kleinzaches/metadata.csv",
             "spiegel_kaetzchen/metadata.csv",
             "herrnarnesschatz/metadata.csv",
             "maedchen_von_moorhof/metadata.csv",
             "koenigsgaukler/metadata.csv",
             "altehous/metadata.csv",
             "odysseus/metadata.csv",
             "undine/metadata.csv",
             "reise_tilsit/metadata.csv",
             "schmied_seines_glueckes/metadata.csv",
             "kammmacher/metadata.csv",
             "unterm_birnbaum/metadata.csv",
             "liebesbriefe/metadata.csv",
             "sandmann/metadata.csv"]
# Worker processes used when loading audio in parallel below.
NUM_PROC = 8
In [ ]:
# use your own preprocessor at this stage - TTS/datasets/proprocess.py items = mailabs(DATA_PATH, META_DATA) print(" > Number of audio files: {}".format(len(items)))
In [ ]:
# Collect every wav path referenced by the metadata and report any file
# that does not exist on disk.
wav_files = []
for item in items:
    wav_file = item[1].strip()
    wav_files.append(wav_file)
    if not os.path.exists(wav_file):
        # BUG FIX: the original printed the undefined name `waf_path`,
        # which raised NameError on the first missing file.
        print(wav_file)
In [ ]:
# Report wav paths that appear more than once in the metadata.
counts = Counter(wav_files)
duplicates = [path for path, n in counts.items() if n > 1]
print(duplicates)
In [ ]:
def load_item(item):
    """Load one dataset entry.

    Args:
        item: a metadata entry; item[0] is the transcript, item[1] the wav path.

    Returns:
        (file_name, text, text_len, audio, audio_len) where audio_len is the
        clip duration in seconds.
    """
    file_name = item[1].strip()
    text = item[0].strip()
    # sr=None keeps the file's native sampling rate.
    audio, sr = librosa.load(file_name, sr=None)
    audio_len = len(audio) / sr
    text_len = len(text)
    return file_name, text, text_len, audio, audio_len

# This will take a while depending on size of dataset
if NUM_PROC == 1:
    data = []
    for m in tqdm(items):
        data += [load_item(m)]
else:
    # FIX: honor NUM_PROC — the original hard-coded Pool(8), so changing
    # NUM_PROC above had no effect on the parallel path.
    with Pool(NUM_PROC) as p:
        data = list(tqdm(p.imap(load_item, items), total=len(items)))
In [ ]:
# Tally word frequencies over every transcript in the dataset.
w_count = Counter()
for entry in tqdm(data):
    for token in entry[1].lower().strip().split():
        w_count[token] += 1
print(" > Number of words: {}".format(len(w_count)))
In [ ]:
text_vs_durs = {}  # text length vs audio duration
text_len_counter = Counter()  # number of sentences with the keyed length
for item in tqdm(data):
    text = item[1].lower().strip()
    text_len = len(text)
    text_len_counter[text_len] += 1
    audio_len = item[-1]
    # FIX: the original used `try: ... except:` with a bare except, which
    # would silently swallow unrelated errors (e.g. KeyboardInterrupt).
    # setdefault expresses the "append, creating the list on first use"
    # intent directly.
    text_vs_durs.setdefault(text_len, []).append(audio_len)
In [ ]:
# Per text-length summary statistics of the audio durations:
# mean, median and standard deviation for each observed length.
text_vs_avg = {length: np.mean(durs) for length, durs in text_vs_durs.items()}
text_vs_median = {length: np.median(durs) for length, durs in text_vs_durs.items()}
text_vs_std = {length: np.std(durs) for length, durs in text_vs_durs.items()}
Avg audio length per char
In [ ]:
# Print every entry whose audio is shorter than 2 seconds — likely
# mislabeled or truncated clips worth inspecting.
short_entries = (entry for entry in data if entry[-1] < 2)
for entry in short_entries:
    print(entry)
In [ ]:
# Seconds of audio per transcript character, one value per dataset entry.
sec_per_chars = [entry[-1] / len(entry[1]) for entry in data]
In [ ]:
# Overall mean and standard deviation of the per-character durations.
rates = np.asarray(sec_per_chars)
mean = rates.mean()
std = rates.std()
print(mean)
print(std)
In [ ]:
# Fit a normal distribution to the per-character durations and print
# entries whose speaking rate is improbable under it (too long or too
# short for their text length).
dist = norm(mean, std)
for item in data:
    text = item[1]
    dur = item[-1]
    sec_per_char = dur / len(text)
    # BUG FIX: the original called norm.pdf(sec_per_char) — the *standard*
    # normal N(0, 1) — so the fitted `dist` was never used and the test was
    # meaningless for these small values. Evaluate the fitted density instead.
    # NOTE(review): the 0.39 cutoff was tuned for the standard-normal peak
    # (~0.3989); with the fitted density it flags roughly the >3-sigma tail
    # for typical stds — confirm it matches the intended sensitivity.
    pdf = dist.pdf(sec_per_char)
    if pdf < 0.39:
        print(item)
Plot Dataset Statistics
In [ ]:
# Scatter plot: transcript length against mean audio duration.
plt.title("text length vs mean audio duration")
lengths = list(text_vs_avg.keys())
averages = list(text_vs_avg.values())
plt.scatter(lengths, averages)
In [ ]:
# Scatter plot: transcript length against median audio duration.
plt.title("text length vs median audio duration")
lengths = list(text_vs_median.keys())
medians = list(text_vs_median.values())
plt.scatter(lengths, medians)
In [ ]:
# Scatter plot: transcript length against duration standard deviation.
plt.title("text length vs STD")
lengths = list(text_vs_std.keys())
deviations = list(text_vs_std.values())
plt.scatter(lengths, deviations)
In [ ]:
# Scatter plot: transcript length against number of sentences of that length.
plt.title("text length vs # instances")
lengths = list(text_len_counter.keys())
instance_counts = list(text_len_counter.values())
plt.scatter(lengths, instance_counts)
Check words frequencies
In [ ]:
# Word-frequency table as a DataFrame (word -> count), most frequent first.
w_count_df = (
    pd.DataFrame.from_dict(w_count, orient='index')
    .sort_values(0, ascending=False)
)
In [ ]:
# Display the sorted word-frequency table (notebook cell output).
w_count_df
In [ ]:
# check a certain word w_count_df.at['minute', 0]
In [ ]:
# fequency bar plot - it takes time!! w_count_df.plot.bar()