The language of each item (sample/utterance) is set to the dataset language only when it is not already defined at the sample/utterance level

The speaker name is prefixed with the dataset name in the case of multispeaker datasets
Refactor "artic" formatter
This commit is contained in:
Jindrich Matousek 2023-09-06 17:05:47 +02:00
parent a0db2eeee8
commit c312343585
2 changed files with 14 additions and 7 deletions

View File

@ -58,8 +58,12 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
def add_extra_keys(metadata, language, dataset_name): def add_extra_keys(metadata, language, dataset_name):
for item in metadata: for item in metadata:
# add language name # JMa: Add language name only if not defined at the sample level. Could be good for multi-language datasets.
if not item["language"]:
item["language"] = language item["language"] = language
# JMa: Prepend dataset name to speaker name. Could be good for multispeaker datasets.
if item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"):
item["speaker_name"] = f'{dataset_name}_{item["speaker_name"]}'
# add unique audio name # add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0] relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}" audio_unique_name = f"{dataset_name}#{relfilepath}"

View File

@ -652,11 +652,14 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
items = [] items = []
# Speaker name is the name of the directory with the data (last part of `root_path`) # Speaker name is the name of the directory with the data (last part of `root_path`)
speaker_name = os.path.basename(os.path.normpath(root_path)) speaker_name = os.path.basename(os.path.normpath(root_path))
# Speaker name can consists of language code (eg. cs-CZ) and gender (m/f) separated by dots # Speaker name can consists of language code (eg. cs-CZ or en) and gender (m/f) separated by dots
# Example: AndJa.cs-CZ.m # Example: AndJa.cs-CZ.m, LJS.en.f
parts = speaker_name.split(".") try:
lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None voice, lang, sex = speaker_name.split(".")
print(f" > ARTIC dataset: voice {parts[0]}, language {lang}") except ValueError:
voice = speaker_name
lang, sex = None, None
print(f" > ARTIC dataset: voice={voice}, sex={sex}, language={lang}")
with open(txt_file, "r", encoding="utf-8") as ttf: with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf: for line in ttf:
# Check the number of standard separators # Check the number of standard separators