Internal formatter (#1629)

* Add coqui formatter

* Make style
This commit is contained in:
WeberJulian 2022-06-08 14:31:03 +02:00 committed by GitHub
parent 68cef28a88
commit f09ea11c71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 29 additions and 0 deletions

View File

@ -5,6 +5,7 @@ from glob import glob
from pathlib import Path
from typing import List
import pandas as pd
from tqdm import tqdm
########################
@ -12,6 +13,34 @@ from tqdm import tqdm
########################
def coqui(root_path, meta_file, ignored_speakers=None):
    """Internal dataset formatter.

    Reads a ``|``-separated metadata file with pandas and turns every row whose
    audio file exists on disk into a sample dictionary.

    The metadata file must provide the columns ``audio_file`` and ``text``.
    The columns ``speaker_name`` and ``emotion_name`` are optional; when one of
    them is absent, every sample gets the default value ``"coqui"`` or
    ``"neutral"`` respectively.

    Args:
        root_path (str): Dataset root. ``meta_file`` and every ``audio_file``
            entry are joined onto this path.
        meta_file (str): Path of the metadata file, relative to ``root_path``.
        ignored_speakers (optional): Container of speaker names whose rows are
            skipped. Only applied when the metadata has a ``speaker_name``
            column; without that column no row is filtered by speaker.

    Returns:
        list: One dict per kept row with the keys ``text``, ``audio_file``
        (joined with ``root_path``), ``speaker_name`` and ``emotion_name``.

    Raises:
        AssertionError: If ``audio_file`` or ``text`` is not a column of the
            metadata file.
    """
    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")

    missing_columns = [col for col in ("audio_file", "text") if col not in metadata.columns]
    if missing_columns:
        # Raised explicitly (instead of a bare `assert`) so the check survives
        # `python -O` and reports what is wrong; the exception type is kept for
        # callers that already handle the original assertion failure.
        raise AssertionError(f"Metadata file {meta_file} is missing required column(s): {missing_columns}")

    has_speaker_column = "speaker_name" in metadata.columns
    has_emotion_column = "emotion_name" in metadata.columns
    # Speaker filtering needs a per-row speaker name, so it is disabled when the
    # column is absent. Resolved once here rather than on every row.
    speakers_to_skip = ignored_speakers if has_speaker_column else None

    items = []
    not_found_counter = 0
    for row in metadata.itertuples():
        if speakers_to_skip is not None and row.speaker_name in speakers_to_skip:
            continue
        audio_path = os.path.join(root_path, row.audio_file)
        if not os.path.exists(audio_path):
            # Rows pointing at missing audio are dropped and only counted.
            not_found_counter += 1
            continue
        items.append(
            {
                "text": row.text,
                "audio_file": audio_path,
                "speaker_name": row.speaker_name if has_speaker_column else "coqui",
                "emotion_name": row.emotion_name if has_emotion_column else "neutral",
            }
        )
    if not_found_counter > 0:
        print(f" | > [!] {not_found_counter} files not found")
    return items
def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset