mirror of https://github.com/coqui-ai/TTS.git
9.6 KiB
9.6 KiB
None
<html lang="en">
<head>
</head>
</html>
In [ ]:
# Notebook for choosing a speaker from the Belarusian Mozilla Common Voice corpus.
# Originally created by jhlfrfufyfn.
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [ ]:
# The corpus archive can be unpacked once with:
#   import tarfile
#   with tarfile.open("cv-corpus-12.0-2022-12-07-be.tar.gz", "r:gz") as tar:
#       tar.extractall()

# Root of the unpacked Common Voice corpus and destination for the filtered dataset.
corpuspath = '/a/cv-corpus'
outputpath = '/storage/filtered_dataset'
In [ ]:
# Load the manifest of community-validated clips shipped with the corpus.
df = pd.read_csv(corpuspath + '/be/validated.tsv', sep='\t', low_memory=False)
df
In [ ]:
# Discard demographic/locale metadata we never use downstream.
unused_columns = ['age', 'accents', 'gender', 'variant', 'locale', 'segment']
df = df.drop(columns=unused_columns)
df
In [ ]:
# How many validated records still carry at least one down-vote?
df.loc[df['down_votes'] > 0].count()
In [ ]:
# How many validated records received no up-votes at all?
df.loc[df['up_votes'] == 0].count()
In [ ]:
# Keep only clips with zero down-votes and at least one up-vote.
# (Combining both conditions in one mask is equivalent to the two sequential filters.)
df = df[(df['down_votes'] == 0) & (df['up_votes'] > 0)]
df
In [ ]:
# Vote counts are no longer needed after filtering.
df = df.drop(columns=['down_votes', 'up_votes'])
df
In [ ]:
# Rank speakers by number of validated clips, most prolific first.
df_sorted = (
    df.groupby('client_id')
    .count()
    .sort_values(by='path', ascending=False)
)
df_sorted
In [ ]:
# Keep the ten most prolific speakers for manual review.
top_10_speakers = df_sorted.iloc[:10]
top_10_speakers
In [ ]:
def get_speaker_audio_list(speaker_id, n=10):
    """Return up to *n* random clip filenames recorded by *speaker_id*.

    Reads the module-level ``df`` manifest. If the speaker has fewer than
    *n* clips, all of them are returned — ``DataFrame.sample`` would
    otherwise raise ``ValueError`` when *n* exceeds the row count.
    """
    speaker_rows = df[df['client_id'] == speaker_id]
    return speaker_rows.sample(min(n, len(speaker_rows)))['path'].values.tolist()
In [ ]:
# CHOOSE: index (into top_10_speakers) of the speaker to inspect.
speaker_index = 0
chosen_speaker_id = top_10_speakers.index[speaker_index]
speaker_audio_list = get_speaker_audio_list(chosen_speaker_id)
print(speaker_audio_list)
In [ ]:
# Listen to the sampled clips (stored under <corpus>/be/clips) inline in the notebook.
import IPython.display as ipd

for clip_name in speaker_audio_list:
    clip_path = corpuspath + '/be/clips/' + clip_name
    display(ipd.Audio(clip_path))
In [ ]:
# Listening notes per speaker index:
# 0 — pretty good
# 1 — bad
# 2 — partly sounds like 0; the other clips are different
# 3 — bad
# 4 — fast and clear, but not good overall
# 5 — echoing; the mic sometimes crackles
# 6 — very slow and clear, but possibly accented
# 7 — lots of intonation, but pretty clear
# 8 — clear and slow, with an occasional slight mic crackle
# 9 — background noise, whispering
# Candidate speakers: 0, 6, 8
In [ ]:
def get_speech_rate(speaker_id, max_samples=1000):
    """Estimate a speaker's mean speech rate in words per minute.

    Reads the module-level ``df`` manifest and measures clip durations on
    disk with librosa, so the call is I/O heavy. At most *max_samples*
    random clips are measured; speakers with fewer clips are measured in
    full — the original hard-coded ``sample(1000)`` raised ``ValueError``
    for them.
    """
    speaker_df = df[df['client_id'] == speaker_id]
    # .copy() so the column assignments below do not trigger
    # SettingWithCopyWarning on a view of df.
    speaker_df = speaker_df.sample(min(max_samples, len(speaker_df))).copy()
    # Duration of each audio file, in seconds.
    speaker_df['duration'] = speaker_df['path'].apply(
        lambda p: librosa.get_duration(path=corpuspath + '/be/clips/' + p)
    )
    # Whitespace-token count of the transcript.
    speaker_df['words'] = speaker_df['sentence'].apply(lambda s: len(s.split()))
    # words/second -> words/minute
    speaker_df['speech_rate'] = speaker_df['words'] / speaker_df['duration'] * 60
    return speaker_df['speech_rate'].mean()
In [ ]:
# Report the estimated speech rate for the chosen speaker.
rate = get_speech_rate(top_10_speakers.index[speaker_index])
print(f'Speech rate for speaker {speaker_index}: ', rate)
In [ ]:
def get_average_duration(df_speaker, max_samples=1000):
    """Return the mean clip duration in seconds for one speaker's manifest rows.

    Measures at most *max_samples* randomly chosen clips with librosa;
    smaller manifests are measured in full — the original hard-coded
    ``sample(1000)`` raised ``ValueError`` for speakers with fewer than
    1000 clips. The input frame is not mutated.
    """
    sampled = df_speaker.sample(min(max_samples, len(df_speaker))).copy()
    sampled['duration'] = sampled['path'].apply(
        lambda p: librosa.get_duration(path=corpuspath + '/be/clips/' + p)
    )
    return sampled['duration'].mean()
In [ ]:
# Estimate per-clip and total recording time for the chosen speaker.
df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]
avg_duration = get_average_duration(df_speaker)
avg_total_duration = avg_duration * len(df_speaker.index)
total_hours = avg_total_duration / 60.0 / 60.0
print(f'Average duration for speaker {speaker_index}: ', avg_duration, ", average total duration(hours): ", total_hours)
In [ ]:
# Restrict the chosen speaker's manifest to roughly the latest `limit_hours`
# of audio, using the average clip duration to translate hours into a file count.
df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]
df_speaker = df_speaker.drop(columns=['client_id'])
limit_hours = 30
limit_files = round(limit_hours * 60 * 60 / avg_duration)
df_speaker = df_speaker.tail(limit_files)
df_speaker
In [ ]:
# # move all files of that speaker to another folder # # use multiprocessing to speed up # # add progress bar # from tqdm import tqdm # import multiprocessing # from multiprocessing import Pool # import shutil # def move_file(file): # shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file) # # get list of files to move # files = df_speaker['path'].values.tolist() # # move files # with Pool(multiprocessing.cpu_count()) as p: # r = list(tqdm(p.imap(move_file, files), total=len(files)))
In [ ]:
# Reset the output directory and write the `path|sentence` manifest
# (clip paths renamed .mp3 -> .wav to match the converted audio below).
if os.path.isdir(outputpath):
    # Remove stale files from a previous run; skip subdirectories, which
    # os.remove() would raise on.
    for entry in os.scandir(outputpath):
        if entry.is_file():
            os.remove(entry.path)
else:
    # makedirs (vs. mkdir) also creates missing parent directories.
    os.makedirs(outputpath, exist_ok=True)

df_speaker['path2'] = df_speaker['path'].str.replace(r'\.mp3$', '.wav', regex=True)
df_speaker[['path2', 'sentence']].to_csv(
    outputpath + '/df_speaker.csv', sep='|', header=False, index=False
)
In [ ]:
# Convert the selected speaker's clips from mp3 to 22050 Hz wav in parallel.
import multiprocessing
from multiprocessing import Pool

from pydub import AudioSegment
from tqdm import tqdm


def convert_mp3_to_wav(file):
    """Resample one corpus mp3 clip to 22050 Hz and save it as wav in outputpath."""
    sound = AudioSegment.from_mp3(corpuspath + '/be/clips/' + file)
    sound = sound.set_frame_rate(22050)
    # splitext is safer than slicing off a fixed 4-character suffix (file[:-4]),
    # which silently corrupts names with longer or missing extensions.
    stem = os.path.splitext(file)[0]
    sound.export(outputpath + '/' + stem + '.wav', format='wav')


# Fan the conversion out over all cores, with a progress bar.
files = df_speaker['path'].values.tolist()
with Pool(multiprocessing.cpu_count()) as p:
    r = list(tqdm(p.imap(convert_mp3_to_wav, files), total=len(files)))