From e84f120a04c8851a0b3a63a7018b419e8ccce84b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Thu, 1 Apr 2021 03:41:41 +0200
Subject: [PATCH] sam-accenture model preprocessor

---
 TTS/tts/datasets/preprocess.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index a82a5db2..12148b1e 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -1,12 +1,12 @@
 import os
-from glob import glob
 import re
 import sys
+import xml.etree.ElementTree as ET
+from glob import glob
 from pathlib import Path
 from typing import List
 
 from tqdm import tqdm
-
 from TTS.tts.utils.generic_utils import split_dataset
 
 ####################
@@ -168,6 +168,23 @@ def ljspeech(root_path, meta_file):
     return items
 
 
+def sam_accenture(root_path, meta_file):
+    """Reads the sam-accenture XML meta file into [text, wav_file, speaker_name] items
+    https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
+    speaker_name = "sam_accenture"
+    wav_dir = os.path.join(root_path, 'vo_voice_quality_transformation')
+    meta_path = os.path.join(root_path, 'voice_over_recordings', meta_file)
+    items = []
+    # direct <fileid> children of the root: element text is the transcript, `id` names the wav
+    for entry in ET.parse(meta_path).getroot().findall('./fileid'):
+        wav_file = os.path.join(wav_dir, entry.get('id')+'.wav')
+        if os.path.exists(wav_file):
+            items.append([entry.text, wav_file, speaker_name])
+        else:
+            print(f' [!] {wav_file} in metafile does not exist. Skipping...')
+    return items
+
+
 def ruslan(root_path, meta_file):
     """Normalizes the RUSLAN meta data file to TTS format
     https://ruslan-corpus.github.io/"""