Approximate audio length from file size

Eren Gölge 2021-10-18 08:54:02 +00:00
parent b4b890df03
commit a0a5d580e9
4 changed files with 5 additions and 5 deletions


@@ -106,7 +106,7 @@ class TacotronConfig(BaseTTSConfig):
             Weight decay coefficient. Defaults to `1e-6`.
         grad_clip (float):
             Gradient clipping threshold. Defaults to `5`.
-        seq_len_notm (bool):
+        seq_len_norm (bool):
             enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
             is divided by the sequence length. Defaults to False.
         loss_masking (bool):
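
To make the `seq_len_norm` option concrete, here is a minimal sketch of per-sample sequence-length normalization inside a masked loss. The function name, tensor shapes, and the L1 criterion are illustrative assumptions, not the repository's actual loss implementation:

import torch

def masked_l1_loss(pred, target, lengths, seq_len_norm=False):
    # Sketch only: naming and shapes are assumptions, not the library's loss code.
    # pred, target: (batch, time, feats); lengths: (batch,) valid frames per sample.
    time_idx = torch.arange(pred.size(1), device=pred.device)
    mask = (time_idx[None, :] < lengths[:, None]).unsqueeze(-1).float()
    per_sample = (torch.abs(pred - target) * mask).sum(dim=(1, 2))
    if seq_len_norm:
        # Divide each sample's loss by its own valid length so short and long
        # utterances contribute comparably to the batch loss.
        per_sample = per_sample / (lengths.float() * pred.size(2))
    return per_sample.mean()

Without this normalization, long utterances contribute more to the batch loss simply because they have more frames; dividing each sample's loss by its own valid length removes that bias.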


@@ -330,7 +330,7 @@ class TTSDataset(Dataset):
         if by_audio_len:
             lengths = []
             for item in self.items:
-                lengths.append(os.path.getsize(item[1]))
+                lengths.append(os.path.getsize(item[1]) / 16 * 8) # assuming 16bit audio
             lengths = np.array(lengths)
         else:
             lengths = np.array([len(ins[0]) for ins in self.items])
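
For context on the new line: a file's size in bits divided by the bits per sample approximates the number of samples, which is proportional to the audio duration, so the scaled file size works as a sorting key without decoding any audio. A minimal sketch of the same idea expressed as a duration estimate (the function name, sample rate, and channel count below are illustrative assumptions, not part of the commit):

import os

def approx_audio_seconds(wav_path, sample_rate=22050, bit_depth=16, channels=1):
    # Rough estimate from file size alone; assumes uncompressed PCM (e.g. WAV)
    # and ignores header bytes, so it slightly overestimates the true duration.
    n_samples = os.path.getsize(wav_path) * 8 / (bit_depth * channels)
    return n_samples / sample_rate

When every file shares the same sample rate, bit depth, and channel count, those factors cancel for sorting purposes, which is why the commit only rescales the byte count by the assumed 16-bit sample width.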


@@ -242,4 +242,4 @@ class BaseTacotron(BaseTTS):
         self.decoder.set_r(r)
         if trainer.config.bidirectional_decoder:
             trainer.model.decoder_backward.set_r(r)
-        print(f"\n > Number of output frames: {self.decoder.r}")
+        print(f"\n > Number of output frames: {self.decoder.r}")


@@ -20,9 +20,9 @@ from TTS.utils.audio import AudioProcessor
 class BaseTTS(BaseModel):
-    """Abstract `tts` class. Every new `tts` model must inherit this.
+    """Base `tts` class. Every new `tts` model must inherit this.
-    It defines `tts` specific functions on top of `Model`.
+    It defines common `tts` specific functions on top of `Model` implementation.
     Notes on input/output tensor shapes:
         Any input or output tensor of the model must be shaped as