mirror of https://github.com/coqui-ai/TTS.git
347 lines
9.6 KiB
Plaintext
347 lines
9.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# This notebook was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Common Voice corpus\n",
|
|
"#\n",
|
|
"#\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"import os\n",
|
|
"import librosa"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# unpackage tar gz file cv-corpus-12.0-2022-12-07-be.tar.gz\n",
|
|
"# import tarfile\n",
|
|
"# tar = tarfile.open(\"cv-corpus-12.0-2022-12-07-be.tar.gz\", \"r:gz\")\n",
|
|
"# tar.extractall()\n",
|
|
"# tar.close()\n",
|
|
"\n",
|
|
"# Dataset locations. Set the CV_CORPUS_PATH / CV_OUTPUT_PATH environment variables\n",
"# to override them; the defaults are the paths this notebook originally hard-coded.\n",
"corpuspath = os.environ.get('CV_CORPUS_PATH', '/a/cv-corpus')\n",
"outputpath = os.environ.get('CV_OUTPUT_PATH', '/storage/filtered_dataset')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# load the tab-separated table of validated clips\n",
"validated_tsv = corpuspath + '/be/validated.tsv'\n",
"df = pd.read_csv(validated_tsv, sep='\\t', low_memory=False)\n",
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# drop the speaker/clip metadata columns that are not used below\n",
"unused_columns = ['age', 'accents', 'gender', 'variant', 'locale', 'segment']\n",
"df = df.drop(columns=unused_columns)\n",
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# count the records that received at least one down vote\n",
"has_down_votes = df['down_votes'].gt(0)\n",
"df[has_down_votes].count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# count the records that received no up votes at all\n",
"has_no_up_votes = df['up_votes'].eq(0)\n",
"df[has_no_up_votes].count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# keep only the rows with zero down votes and at least one up vote\n",
"no_down_votes = df['down_votes'] == 0\n",
"some_up_votes = df['up_votes'] > 0\n",
"df = df[no_down_votes & some_up_votes]\n",
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# the vote columns are no longer needed after filtering\n",
"df = df.drop(columns=['down_votes', 'up_votes'])\n",
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# count the rows of every client_id and put the largest counts first\n",
"rows_per_speaker = df.groupby('client_id').count()\n",
"df_sorted = rows_per_speaker.sort_values(by='path', ascending=False)\n",
"df_sorted"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# the ten speakers with the most remaining clips\n",
"top_10_speakers = df_sorted.iloc[:10]\n",
"top_10_speakers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_speaker_audio_list(speaker_id, n=10):\n",
"    \"\"\"Return up to `n` randomly sampled clip paths recorded by `speaker_id`.\n",
"\n",
"    Rows are looked up in the notebook-level `df`. When the speaker has fewer\n",
"    than `n` clips, all of them are returned (in random order) instead of\n",
"    letting DataFrame.sample raise ValueError.\n",
"    \"\"\"\n",
"    speaker_rows = df[df['client_id'] == speaker_id]\n",
"    sample_size = min(n, len(speaker_rows))\n",
"    return speaker_rows.sample(sample_size)['path'].values.tolist()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# CHOOSE: row position in top_10_speakers of the speaker to work with\n",
"speaker_index = 0\n",
"chosen_speaker_id = top_10_speakers.index[speaker_index]\n",
"speaker_audio_list = get_speaker_audio_list(chosen_speaker_id)\n",
"print(speaker_audio_list)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# render an audio player for every sampled clip\n",
"# the clips are read from <corpuspath>/be/clips\n",
"import IPython.display as ipd\n",
"clips_dir = corpuspath + '/be/clips/'\n",
"for clip_name in speaker_audio_list:\n",
"    display(ipd.Audio(clips_dir + clip_name))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 0 is pretty good\n",
|
|
"# 1 is bad\n",
|
|
"# 2 is partly 0, other are different\n",
|
|
"# 3 is bad\n",
|
|
"# 4 is pretty fast and clear, but not good\n",
|
|
"# 5 is echoing, sometimes mic cracks\n",
|
|
"# 6 is really slow and clear, but accent?\n",
|
|
"# 7 has a lot of intonation, but is pretty clear\n",
|
|
"# 8 is clear and slow, sometimes little mic crack\n",
|
|
"# 9 has background noise, whispering\n",
|
|
"\n",
|
|
"# options: 0, 6, 8"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_speech_rate(speaker_id):\n",
"    \"\"\"Estimate the mean speech rate, in words per minute, of one speaker.\n",
"\n",
"    Up to 1000 randomly chosen clips of `speaker_id` (from the notebook-level\n",
"    `df`) are measured. Clip length comes from librosa.get_duration and the\n",
"    word count from splitting the transcript on whitespace. Rows with a\n",
"    missing transcript or a zero-length clip do not contribute to the mean.\n",
"    \"\"\"\n",
"    speaker_rows = df[df['client_id'] == speaker_id]\n",
"    # DataFrame.sample raises ValueError when asked for more rows than exist\n",
"    clips = speaker_rows.sample(min(1000, len(speaker_rows)))\n",
"    # duration of each clip; kept in a local Series so no column is written into a slice of df\n",
"    durations = clips['path'].apply(lambda name: librosa.get_duration(path=corpuspath + '/be/clips/' + name))\n",
"    # number of whitespace-separated words per transcript (NaN for missing text)\n",
"    word_counts = clips['sentence'].str.split().str.len()\n",
"    # words per second -> words per minute; a zero duration would give an infinite rate\n",
"    rates = (word_counts / durations * 60)[durations > 0]\n",
"    return rates.mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# report the speech rate of the chosen speaker\n",
"speech_rate = get_speech_rate(top_10_speakers.index[speaker_index])\n",
"print(f'Speech rate for speaker {speaker_index}: ', speech_rate)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_average_duration(df_speaker):\n",
"    \"\"\"Estimate the mean clip duration for the rows of `df_speaker`.\n",
"\n",
"    At most 1000 randomly chosen rows are measured with librosa.get_duration,\n",
"    reading each file from <corpuspath>/be/clips/. Frames with fewer than 1000\n",
"    rows are measured in full instead of letting DataFrame.sample raise\n",
"    ValueError. The caller's frame is not modified.\n",
"    \"\"\"\n",
"    clips = df_speaker.sample(min(1000, len(df_speaker)))\n",
"    # kept in a local Series so no column is written into a slice of the caller's frame\n",
"    durations = clips['path'].apply(lambda name: librosa.get_duration(path=corpuspath + '/be/clips/' + name))\n",
"    return durations.mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# estimate how much audio the chosen speaker has in total\n",
"chosen_speaker_id = top_10_speakers.index[speaker_index]\n",
"df_speaker = df[df['client_id'] == chosen_speaker_id]\n",
"\n",
"avg_duration = get_average_duration(df_speaker)\n",
"# sampled mean clip duration times the number of clips of this speaker\n",
"avg_total_duration = avg_duration * len(df_speaker)\n",
"total_hours = avg_total_duration/60.0/60.0\n",
"print(f'Average duration for speaker {speaker_index}: ', avg_duration, \", average total duration(hours): \", total_hours)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# rows of the chosen speaker, without the client_id column\n",
"is_chosen_speaker = df['client_id'] == top_10_speakers.index[speaker_index]\n",
"df_speaker = df[is_chosen_speaker].drop(columns=['client_id'])\n",
"\n",
"# keep roughly limit_hours of audio, taken from the end of the table\n",
"limit_hours = 30\n",
"limit_seconds = limit_hours * 60 * 60\n",
"limit_files = round(limit_seconds / avg_duration)\n",
"df_speaker = df_speaker.tail(limit_files)\n",
"\n",
"df_speaker"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # move all files of that speaker to another folder\n",
|
|
"# # use multiprocessing to speed up\n",
|
|
"# # add progress bar\n",
|
|
"# from tqdm import tqdm\n",
|
|
"# import multiprocessing\n",
|
|
"# from multiprocessing import Pool\n",
|
|
"# import shutil\n",
|
|
"\n",
|
|
"# def move_file(file):\n",
|
|
"# shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)\n",
|
|
"\n",
|
|
"# # get list of files to move\n",
|
|
"# files = df_speaker['path'].values.tolist()\n",
|
|
"\n",
|
|
"# # move files\n",
|
|
"# with Pool(multiprocessing.cpu_count()) as p:\n",
|
|
"# r = list(tqdm(p.imap(move_file, files), total=len(files)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# clear stale files from the output directory (or create it), then save the text lines to csv\n",
"if os.path.isdir(outputpath):\n",
"    for entry in os.scandir(outputpath):\n",
"        # os.remove raises on directories, so subdirectories are left untouched\n",
"        if not entry.is_dir(follow_symlinks=False):\n",
"            os.remove(entry.path)\n",
"else:\n",
"    # makedirs also creates missing parent directories\n",
"    os.makedirs(outputpath)\n",
"\n",
"# raw string: '\\.' is an invalid escape sequence in a normal string literal\n",
"# assign() builds a new frame instead of writing a column into the tail() slice\n",
"df_speaker = df_speaker.assign(path2=df_speaker['path'].str.replace(r'\\.mp3$', '.wav', regex=True))\n",
"df_speaker[['path2','sentence']].to_csv(outputpath+'/df_speaker.csv', sep='|', header=False, index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# convert the selected speaker's mp3 clips to 22050 Hz wav files in outputpath, using multiprocessing and tqdm\n",
|
|
"import multiprocessing\n",
|
|
"from multiprocessing import Pool\n",
|
|
"from tqdm import tqdm\n",
|
|
"from pydub import AudioSegment\n",
|
|
"\n",
|
|
"def convert_mp3_to_wav(file):\n",
"    \"\"\"Convert one corpus clip to a wav file with a 22050 Hz frame rate.\n",
"\n",
"    `file` is a clip file name inside <corpuspath>/be/clips/. The result is\n",
"    written to `outputpath` under the same base name with a .wav extension.\n",
"    \"\"\"\n",
"    sound = AudioSegment.from_mp3(corpuspath+'/be/clips/' + file)\n",
"    sound = sound.set_frame_rate(22050)\n",
"    # splitext removes whatever extension is present instead of blindly cutting 4 characters\n",
"    stem = os.path.splitext(file)[0]\n",
"    # export() hands back the opened output file; close it explicitly\n",
"    sound.export(outputpath+'/' + stem + '.wav', format='wav').close()\n",
|
|
"\n",
|
|
"# clip file names of the selected speaker\n",
"files = df_speaker['path'].tolist()\n",
"\n",
"# convert them in parallel, one worker process per CPU, with a progress bar\n",
"worker_count = multiprocessing.cpu_count()\n",
"with Pool(worker_count) as p:\n",
"    progress = tqdm(p.imap(convert_mp3_to_wav, files), total=len(files))\n",
"    r = list(progress)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|