mirror of https://github.com/coqui-ai/TTS.git
Language of each item (sample/utterance) is set to dataset language only when not defined at the sample/utterance level
Speaker name is prefixed with the dataset name in case of multispeaker datasets. Refactor "artic" formatter.
This commit is contained in:
parent
a0db2eeee8
commit
c312343585
|
@ -58,8 +58,12 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
||||||
|
|
||||||
def add_extra_keys(metadata, language, dataset_name):
|
def add_extra_keys(metadata, language, dataset_name):
|
||||||
for item in metadata:
|
for item in metadata:
|
||||||
# add language name
|
# JMa: Add language name only if not defined at the sample level. Could be good for multi-language datasets.
|
||||||
item["language"] = language
|
if not item["language"]:
|
||||||
|
item["language"] = language
|
||||||
|
# JMa: Prepend dataset name to speaker name. Could be good for multispeaker datasets.
|
||||||
|
if item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"):
|
||||||
|
item["speaker_name"] = f'{dataset_name}_{item["speaker_name"]}'
|
||||||
# add unique audio name
|
# add unique audio name
|
||||||
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
|
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
|
||||||
audio_unique_name = f"{dataset_name}#{relfilepath}"
|
audio_unique_name = f"{dataset_name}#{relfilepath}"
|
||||||
|
|
|
@ -652,11 +652,14 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||||
items = []
|
items = []
|
||||||
# Speaker name is the name of the directory with the data (last part of `root_path`)
|
# Speaker name is the name of the directory with the data (last part of `root_path`)
|
||||||
speaker_name = os.path.basename(os.path.normpath(root_path))
|
speaker_name = os.path.basename(os.path.normpath(root_path))
|
||||||
# Speaker name can consists of language code (eg. cs-CZ) and gender (m/f) separated by dots
|
# Speaker name can consist of language code (eg. cs-CZ or en) and gender (m/f) separated by dots
|
||||||
# Example: AndJa.cs-CZ.m
|
# Example: AndJa.cs-CZ.m, LJS.en.f
|
||||||
parts = speaker_name.split(".")
|
try:
|
||||||
lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None
|
voice, lang, sex = speaker_name.split(".")
|
||||||
print(f" > ARTIC dataset: voice {parts[0]}, language {lang}")
|
except ValueError:
|
||||||
|
voice = speaker_name
|
||||||
|
lang, sex = None, None
|
||||||
|
print(f" > ARTIC dataset: voice={voice}, sex={sex}, language={lang}")
|
||||||
with open(txt_file, "r", encoding="utf-8") as ttf:
|
with open(txt_file, "r", encoding="utf-8") as ttf:
|
||||||
for line in ttf:
|
for line in ttf:
|
||||||
# Check the number of standard separators
|
# Check the number of standard separators
|
||||||
|
|
Loading…
Reference in New Issue