Internal formatter (#1629)

* Add coqui formatter

* Make style
This commit is contained in:
WeberJulian 2022-06-08 14:31:03 +02:00 committed by GitHub
parent 68cef28a88
commit f09ea11c71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 29 additions and 0 deletions

View File

@ -5,6 +5,7 @@ from glob import glob
from pathlib import Path from pathlib import Path
from typing import List from typing import List
import pandas as pd
from tqdm import tqdm from tqdm import tqdm
######################## ########################
@ -12,6 +13,34 @@ from tqdm import tqdm
######################## ########################
def coqui(root_path, meta_file, ignored_speakers=None):
    """Internal dataset formatter.

    Reads a ``|``-separated metadata file with pandas and builds one sample
    dict for every row whose audio file exists on disk.

    Args:
        root_path (str): Dataset root directory. It is joined with ``meta_file``
            and with each row's ``audio_file`` value.
        meta_file (str): Path of the metadata file, relative to ``root_path``.
            The file must provide the columns ``audio_file`` and ``text``;
            the columns ``speaker_name`` and ``emotion_name`` are optional.
        ignored_speakers: Optional container of speaker names. Rows whose
            ``speaker_name`` is in it are skipped. Only consulted when the
            metadata file has a ``speaker_name`` column.

    Returns:
        List[dict]: One dict per kept row with the keys ``text``,
        ``audio_file`` (joined with ``root_path``), ``speaker_name`` and
        ``emotion_name``. Missing optional columns fall back to ``"coqui"``
        and ``"neutral"`` respectively.

    Raises:
        AssertionError: If ``audio_file`` or ``text`` is not a column of the
            metadata file.
    """
    meta_path = os.path.join(root_path, meta_file)
    metadata = pd.read_csv(meta_path, sep="|")

    # Raised explicitly instead of through an `assert` statement so the check is
    # not stripped under `python -O`; the exception type stays the same for
    # callers, and the message names what is actually missing.
    missing_columns = [col for col in ("audio_file", "text") if col not in metadata.columns]
    if missing_columns:
        raise AssertionError(f" [!] Missing required column(s) {missing_columns} in {meta_path}")

    # `None` means "take the value from the row"; otherwise use the fixed default.
    speaker_name = None if "speaker_name" in metadata.columns else "coqui"
    emotion_name = None if "emotion_name" in metadata.columns else "neutral"

    items = []
    not_found_counter = 0
    for row in metadata.itertuples():
        # Speaker filtering is only possible when the file carries speaker names.
        if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
            continue
        audio_path = os.path.join(root_path, row.audio_file)
        if not os.path.exists(audio_path):
            # Rows pointing at missing audio are dropped and reported once below.
            not_found_counter += 1
            continue
        items.append(
            {
                "text": row.text,
                "audio_file": audio_path,
                "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
            }
        )
    if not_found_counter > 0:
        print(f" | > [!] {not_found_counter} files not found")
    return items
def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalize TWEB dataset. """Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset