From 9758971baaed77b94e5a7978d4deb8b956c8bf96 Mon Sep 17 00:00:00 2001
From: Jindrich Matousek <jmatouse@kky.zcu.cz>
Date: Sun, 10 Jul 2022 11:27:02 +0200
Subject: [PATCH] Add artic formatter

---
 TTS/tts/datasets/formatters.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index ef05ea7c..ce36c819 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -556,3 +556,25 @@ def kokoro(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
             text = cols[2].replace(" ", "")
             items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
     return items
+
+
+def artic(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
+    """Normalizes the ARTIC metadata file (`name|text` or SNT `name text` lines) to TTS format"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "artic"
+    with open(txt_file, "r", encoding="utf-8") as ttf:
+        for line in ttf:
+            line = line.strip()  # keep the line terminator out of the text
+            if not line:
+                continue  # blank line (e.g. at end of file): nothing to parse
+            cols = line.split("|")  # split according to standard delimiter
+            if len(cols) > 1:
+                text = cols[-1].strip()  # one or two |s => text is the last part
+            else:
+                # No | => assume ARTIC SNT format: wav name is delimited by the first space
+                cols = line.split(maxsplit=1)
+                text = cols[1]
+            wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav")  # wav name is always `cols[0]`
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items