Parse speaker name in artic dataset to extract language and append language item

Add comments
This commit is contained in:
Jindrich Matousek 2022-09-13 17:32:25 +02:00
parent ec4501d31c
commit 8cfbe23d9e
1 changed files with 16 additions and 2 deletions

View File

@ -581,11 +581,25 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the ARTIC meta data file to TTS format"""
"""Normalizes the ARTIC meta data file to TTS format
Args:
root_path (str): path to the artic dataset
meta_file (str): name of the meta file containing names of wav to select and
transcripts of the corresponding utterances
Returns:
List[Dict]: List of dicts with keys `text`, `audio_file`, `speaker_name`, `language` (None if it cannot be parsed from the speaker name) and `root_path`, one per utterance
"""
txt_file = os.path.join(root_path, meta_file)
items = []
# Speaker name is the name of the directory with the data (last part of `root_path`)
speaker_name = os.path.basename(os.path.normpath(root_path))
# Speaker name can consist of a voice name, a language code (e.g. cs-CZ) and gender (m/f) separated by dots
# Example: AndJa.cs-CZ.m
parts = speaker_name.split(".")
lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None
print(f" > ARTIC dataset: voice {parts[0]}, language {lang}")
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
# Check the number of standard separators
@ -599,7 +613,7 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
# In either way, wav name is stored in `cols[0]` and text in `cols[-1]`
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[-1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "language": lang, "root_path": root_path})
return items