From a0a5d580e97e852939dbe9e3113e4c5cd983d9cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Mon, 18 Oct 2021 08:54:02 +0000
Subject: [PATCH] Approximate audio length from file size

---
 TTS/tts/configs/tacotron_config.py | 2 +-
 TTS/tts/datasets/dataset.py        | 2 +-
 TTS/tts/models/base_tacotron.py    | 2 +-
 TTS/tts/models/base_tts.py         | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py
index 89fb8d81..2577fc51 100644
--- a/TTS/tts/configs/tacotron_config.py
+++ b/TTS/tts/configs/tacotron_config.py
@@ -106,7 +106,7 @@ class TacotronConfig(BaseTTSConfig):
             Weight decay coefficient. Defaults to `1e-6`.
         grad_clip (float):
             Gradient clipping threshold. Defaults to `5`.
-        seq_len_notm (bool):
+        seq_len_norm (bool):
             enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
             is divided by the sequence length. Defaults to False.
         loss_masking (bool):
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index c81e0e6c..bfe0d778 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -330,7 +330,7 @@ class TTSDataset(Dataset):
         if by_audio_len:
             lengths = []
             for item in self.items:
-                lengths.append(os.path.getsize(item[1]))
+                lengths.append(os.path.getsize(item[1]) / 16 * 8)  # approx. sample count, assuming 16bit audio
             lengths = np.array(lengths)
         else:
             lengths = np.array([len(ins[0]) for ins in self.items])
diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py
index b47a5751..c661c4cc 100644
--- a/TTS/tts/models/base_tacotron.py
+++ b/TTS/tts/models/base_tacotron.py
@@ -242,4 +242,4 @@ class BaseTacotron(BaseTTS):
             self.decoder.set_r(r)
             if trainer.config.bidirectional_decoder:
                 trainer.model.decoder_backward.set_r(r)
-            print(f"\n > Number of output frames: {self.decoder.r}")
+            print(f"\n > Number of output frames: {self.decoder.r}")
\ No newline at end of file
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 0c9f60e8..9f4d70c8 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -20,9 +20,9 @@ from TTS.utils.audio import AudioProcessor
 
 
 class BaseTTS(BaseModel):
-    """Abstract `tts` class. Every new `tts` model must inherit this.
+    """Base `tts` class. Every new `tts` model must inherit this.
 
-    It defines `tts` specific functions on top of `Model`.
+    It defines common `tts`-specific functions on top of the `Model` implementation.
 
     Notes on input/output tensor shapes:
         Any input or output tensor of the model must be shaped as