def artic(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Normalize the ARTIC meta data file to TTS format.

    Two line layouts are accepted:

    * pipe-delimited, ``<wav name>|<text>`` or ``<wav name>|<...>|<text>``:
      the text is taken from the last field;
    * space-delimited (assumed to be the ARTIC SNT layout),
      ``<wav name> <text>``: the wav name ends at the first run of whitespace.

    Blank (empty or whitespace-only) lines are skipped. The line terminator is
    not part of the returned text.

    Args:
        root_path: Dataset root directory; audio is expected under
            ``<root_path>/wavs``.
        meta_file: Path of the metadata file, relative to ``root_path``.
        **kwargs: Accepted for a uniform formatter call signature; unused.

    Returns:
        A list of dicts with the keys ``"text"``, ``"audio_file"`` and
        ``"speaker_name"`` (always ``"artic"``), one per non-blank line.

    Raises:
        ValueError: If a non-blank, space-delimited line holds a wav name but
            no text.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "artic"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line_no, raw_line in enumerate(ttf, start=1):
            # Drop the line terminator so it does not leak into the text.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                # Blank lines carry no utterance; indexing them used to crash.
                continue
            if "|" in line:
                # One or more |s are present => text is taken from the last part
                cols = line.split("|")
                wav_name, text = cols[0], cols[-1]
            else:
                # Assume ARTIC SNT format => wav name is delimited by the first space
                cols = line.split(maxsplit=1)
                if len(cols) < 2:
                    raise ValueError(
                        f"{txt_file}:{line_no}: expected '<wav name> <text>' "
                        f"or '<wav name>|<text>', got {line!r}"
                    )
                wav_name, text = cols
            wav_file = os.path.join(root_path, "wavs", wav_name + ".wav")
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items