coqui-tts/recipes/bel-alex73/choose_speaker.ipynb

347 lines
9.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This file was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Common Voice corpus\n",
"#\n",
"#\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import os\n",
"import librosa"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# unpack the tar.gz file cv-corpus-12.0-2022-12-07-be.tar.gz\n",
"# import tarfile\n",
"# tar = tarfile.open(\"cv-corpus-12.0-2022-12-07-be.tar.gz\", \"r:gz\")\n",
"# tar.extractall()\n",
"# tar.close()\n",
"\n",
"# root of the extracted corpus; later cells read <corpuspath>/be/validated.tsv and <corpuspath>/be/clips/\n",
"corpuspath = '/a/cv-corpus'\n",
"# destination for df_speaker.csv and the converted wav files\n",
"# NOTE: the cleanup cell further down deletes the files already present in this directory\n",
"outputpath = '/storage/filtered_dataset'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read the validated clips table (validated.tsv is tab-separated)\n",
"df = pd.read_csv(f'{corpuspath}/be/validated.tsv', delimiter='\\t', low_memory=False)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# drop the metadata columns that are not used below (age, accents, gender, variant, locale, segment)\n",
"df = df.drop(columns=['age', 'accents', 'gender', 'variant', 'locale', 'segment'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count the records that received at least one down vote\n",
"df.loc[df['down_votes'] > 0].count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count the records that received no up votes at all\n",
"df.loc[df['up_votes'] == 0].count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# keep only the clips with no down votes and at least one up vote\n",
"# (i.e. drop every row that has down_votes > 0 or up_votes == 0)\n",
"df = df[(df['down_votes'] == 0) & (df['up_votes'] > 0)]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the vote columns have served their purpose as a filter, so remove them\n",
"df = df.drop(columns=['down_votes', 'up_votes'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# number of remaining clips per speaker (client_id), largest first\n",
"df_sorted = (\n",
"    df.groupby('client_id')\n",
"    .count()\n",
"    .sort_values('path', ascending=False)\n",
")\n",
"df_sorted"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the ten speakers with the most clips (df_sorted is already ordered by clip count)\n",
"top_10_speakers = df_sorted.iloc[:10]\n",
"top_10_speakers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_speaker_audio_list(speaker_id, n=10):\n",
"    \"\"\"Return the clip file names (`path` column) of up to `n` randomly sampled clips of a speaker.\n",
"\n",
"    speaker_id: value of the `client_id` column identifying the speaker.\n",
"    n: maximum number of clips to sample. It is capped at the number of clips\n",
"       the speaker has, because `DataFrame.sample` raises ValueError when asked\n",
"       for more rows than exist (sampling is done without replacement).\n",
"    \"\"\"\n",
"    speaker_clips = df[df['client_id'] == speaker_id]\n",
"    return speaker_clips.sample(min(n, len(speaker_clips)))['path'].values.tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CHOOSE: position (0-9) in top_10_speakers of the speaker we will use\n",
"speaker_index = 0\n",
"speaker_audio_list = get_speaker_audio_list(speaker_id=top_10_speakers.index[speaker_index])\n",
"print(speaker_audio_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# render an inline audio player for every sampled clip of the chosen speaker\n",
"# the clips are read from <corpuspath>/be/clips\n",
"import IPython.display as ipd\n",
"for clip_name in speaker_audio_list:\n",
"    display(ipd.Audio(corpuspath+'/be/clips/' + clip_name))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 0 is pretty good\n",
"# 1 is bad\n",
"# 2 is partly 0, other are different\n",
"# 3 is bad\n",
"# 4 is pretty fast and clear, but not good\n",
"# 5 is echoing, sometimes mic cracks\n",
"# 6 is really slow and clear, but accent?\n",
"# 7 has a lot of intonation, but is pretty clear\n",
"# 8 is clear and slow, sometimes little mic crack\n",
"# 9 has background noise, whispering\n",
"\n",
"# options: 0, 6, 8"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_speech_rate(speaker_id, n_samples=1000):\n",
"    \"\"\"Estimate a speaker's speech rate in words per minute.\n",
"\n",
"    The rate is computed per clip (whitespace-separated words in `sentence`\n",
"    divided by the clip duration reported by librosa) and then averaged over\n",
"    a random sample of the speaker's clips.\n",
"\n",
"    speaker_id: value of the `client_id` column identifying the speaker.\n",
"    n_samples: maximum number of clips to measure (default 1000). It is capped\n",
"        at the number of clips the speaker has, because `DataFrame.sample`\n",
"        raises ValueError when asked for more rows than exist.\n",
"    Returns the mean of the per-clip rates.\n",
"    \"\"\"\n",
"    speaker_clips = df[df['client_id'] == speaker_id]\n",
"    sampled = speaker_clips.sample(min(n_samples, len(speaker_clips)))\n",
"    # plain Series are used instead of writing new columns into the sampled frame\n",
"    # duration of each sampled clip, read from the audio file\n",
"    durations = sampled['path'].apply(lambda name: librosa.get_duration(path=corpuspath+'/be/clips/' + name))\n",
"    # number of words in each sampled sentence\n",
"    word_counts = sampled['sentence'].apply(lambda text: len(text.split()))\n",
"    # words per second -> words per minute, averaged over the sample\n",
"    return (word_counts / durations * 60).mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print the estimated speech rate of the chosen speaker only\n",
"print(f'Speech rate for speaker {speaker_index}: ', get_speech_rate(speaker_id=top_10_speakers.index[speaker_index]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_average_duration(df_speaker, n_samples=1000):\n",
"    \"\"\"Estimate the mean clip duration of a speaker from a random sample of clips.\n",
"\n",
"    df_speaker: DataFrame with a `path` column holding clip file names that\n",
"        exist under corpuspath + '/be/clips/'.\n",
"    n_samples: maximum number of clips to measure (default 1000). It is capped\n",
"        at len(df_speaker), because `DataFrame.sample` raises ValueError when\n",
"        asked for more rows than exist.\n",
"    Returns the mean of librosa.get_duration over the sampled clips.\n",
"    \"\"\"\n",
"    sampled = df_speaker.sample(min(n_samples, len(df_speaker)))\n",
"    # a plain Series is used instead of writing a new column into the sampled frame\n",
"    durations = sampled['path'].apply(lambda name: librosa.get_duration(path=corpuspath+'/be/clips/' + name))\n",
"    return durations.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# estimate the chosen speaker's total recorded time: sampled mean clip duration times number of clips\n",
"df_speaker = df.loc[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
"\n",
"avg_duration = get_average_duration(df_speaker)\n",
"avg_total_duration = len(df_speaker) * avg_duration\n",
"print(f'Average duration for speaker {speaker_index}: ', avg_duration, \", average total duration(hours): \",(avg_total_duration/60.0/60.0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# all clips of the chosen speaker; client_id is the same on every row here, so drop it\n",
"df_speaker = df.loc[df['client_id'] == top_10_speakers.index[speaker_index]].drop(columns=['client_id'])\n",
"\n",
"# keep only the last rows of the table: as many clips as fit into limit_hours\n",
"# at the average clip duration estimated above\n",
"limit_hours = 30\n",
"limit_files = round(limit_hours*60*60 / avg_duration)\n",
"df_speaker = df_speaker.tail(limit_files)\n",
"\n",
"df_speaker"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # move all files of that speaker to another folder\n",
"# # use multiprocessing to speed up\n",
"# # add progress bar\n",
"# from tqdm import tqdm\n",
"# import multiprocessing\n",
"# from multiprocessing import Pool\n",
"# import shutil\n",
"\n",
"# def move_file(file):\n",
"# shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)\n",
"\n",
"# # get list of files to move\n",
"# files = df_speaker['path'].values.tolist()\n",
"\n",
"# # move files\n",
"# with Pool(multiprocessing.cpu_count()) as p:\n",
"# r = list(tqdm(p.imap(move_file, files), total=len(files)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# prepare a clean output directory, then save the transcript lines to csv\n",
"# makedirs also creates missing parent directories and is a no-op if the directory exists\n",
"os.makedirs(outputpath, exist_ok=True)\n",
"# remove files left over from a previous run; real sub-directories are skipped\n",
"# because os.remove() cannot delete a directory and would raise\n",
"with os.scandir(outputpath) as entries:\n",
"    for entry in entries:\n",
"        if not entry.is_dir(follow_symlinks=False):\n",
"            os.remove(entry.path)\n",
"\n",
"# path2 is the clip file name with the .mp3 extension replaced by .wav\n",
"# the pattern is a raw string: '\\.' in a normal literal is an invalid escape sequence\n",
"# assign() builds a new frame instead of writing a column into a slice of df\n",
"df_speaker = df_speaker.assign(path2=df_speaker['path'].str.replace(r'\\.mp3$', '.wav', regex=True))\n",
"# one 'wav name|sentence' line per clip, without header or index\n",
"df_speaker[['path2','sentence']].to_csv(outputpath+'/df_speaker.csv', sep='|', header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# convert every selected mp3 clip to a 22050 Hz wav file in outputpath,\n",
"# in parallel (multiprocessing) and with a tqdm progress bar\n",
"import multiprocessing\n",
"from multiprocessing import Pool\n",
"from tqdm import tqdm\n",
"from pydub import AudioSegment\n",
"\n",
"def convert_mp3_to_wav(file):\n",
"    \"\"\"Read <corpuspath>/be/clips/<file>, set its frame rate to 22050 and export it as wav to outputpath.\"\"\"\n",
"    # the last four characters of the name (the '.mp3' extension) are replaced by '.wav'\n",
"    wav_path = outputpath+'/' + file[:-4] + '.wav'\n",
"    resampled = AudioSegment.from_mp3(corpuspath+'/be/clips/' + file).set_frame_rate(22050)\n",
"    resampled.export(wav_path, format='wav')\n",
"\n",
"# clip file names of the selected speaker\n",
"files = df_speaker['path'].tolist()\n",
"\n",
"# one worker per CPU; list() drains the lazy imap iterator so tqdm can report progress\n",
"with Pool(multiprocessing.cpu_count()) as p:\n",
"    r = list(tqdm(p.imap(convert_mp3_to_wav, files), total=len(files)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}