From c312343585daf7da2ddd275d37da446419cb4367 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 6 Sep 2023 17:05:47 +0200 Subject: [PATCH] Language of each item (sample/utterance) is set to the dataset language only when not defined at the sample/utterance level. Speaker name is prefixed with the dataset name in case of multispeaker datasets. Refactor "artic" formatter. --- TTS/tts/datasets/__init__.py | 8 ++++++-- TTS/tts/datasets/formatters.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 19213856..02434917 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -58,8 +58,12 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): def add_extra_keys(metadata, language, dataset_name): for item in metadata: - # add language name - item["language"] = language + # JMa: Add language name only if not defined at the sample level. Could be good for multi-language datasets. + if not item["language"]: + item["language"] = language + # JMa: Prepend dataset name to speaker name. Could be good for multispeaker datasets. + if item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"): + item["speaker_name"] = f'{dataset_name}_{item["speaker_name"]}' # add unique audio name relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0] audio_unique_name = f"{dataset_name}#{relfilepath}" diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 2ea75ec3..113fc8d1 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -652,11 +652,14 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument items = [] # Speaker name is the name of the directory with the data (last part of `root_path`) speaker_name = os.path.basename(os.path.normpath(root_path)) - # Speaker name can consists of language code (eg. 
cs-CZ) and gender (m/f) separated by dots - # Example: AndJa.cs-CZ.m - parts = speaker_name.split(".") - lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None - print(f" > ARTIC dataset: voice {parts[0]}, language {lang}") + # Speaker name can consist of a language code (e.g. cs-CZ or en) and gender (m/f) separated by dots + # Example: AndJa.cs-CZ.m, LJS.en.f + try: + voice, lang, sex = speaker_name.split(".") + except ValueError: + voice = speaker_name + lang, sex = None, None + print(f" > ARTIC dataset: voice={voice}, sex={sex}, language={lang}") with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: # Check the number of standard separators