From b0b97d636fdd31b07a18c492a00466ce880c6094 Mon Sep 17 00:00:00 2001
From: Qingping Hou <qph@scribd.com>
Date: Sat, 14 Nov 2020 23:43:03 -0800
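Subject: [PATCH] speed up metafile build for voxceleb

Collect the VoxCeleb metadata lines in a list and join them once when
writing the cache file, instead of growing a single string with `+=`
inside the loop. Also annotate the `setup_loader` signature and remove
a commented-out debug print from `brspeech`.
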
---
 TTS/bin/train_encoder.py       | 2 +-
 TTS/tts/datasets/preprocess.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 078f7b84..dba866db 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -35,7 +35,7 @@ print(" > Using CUDA: ", use_cuda)
 print(" > Number of GPUs: ", num_gpus)
 
 
-def setup_loader(ap, is_val=False, verbose=False):
+def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False):
     if is_val:
         loader = None
     else:
diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 469da07e..31d4b2b5 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -228,7 +228,6 @@ def brspeech(root_path, meta_file):
             if line.startswith("wav_filename"):
                 continue
             cols = line.split('|')
-            #print(cols)
             wav_file = os.path.join(root_path, cols[0])
             text = cols[2]
             speaker_name = cols[3]
@@ -304,17 +303,17 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
 
     elif not cache_to.exists():
         cnt = 0
-        meta_data = ""
+        meta_data = []
         wav_files = voxceleb_path.rglob("**/*.wav")
         for path in tqdm(wav_files, desc=f"Building VoxCeleb {voxcel_idx} Meta file ... this needs to be done only once.",
                          total=expected_count):
             speaker_id = str(Path(path).parent.parent.stem)
             assert speaker_id.startswith('id')
             text = None  # VoxCel does not provide transciptions, and they are not needed for training the SE
-            meta_data += f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n"
+            meta_data.append(f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n")
             cnt += 1
         with open(str(cache_to), 'w') as f:
-            f.write(meta_data)
+            f.write("".join(meta_data))
         if cnt < expected_count:
             raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}")