mirror of https://github.com/coqui-ai/TTS.git
Approximate audio length from file size
This commit is contained in:
parent
b4b890df03
commit
a0a5d580e9
|
@ -106,7 +106,7 @@ class TacotronConfig(BaseTTSConfig):
|
|||
Weight decay coefficient. Defaults to `1e-6`.
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. Defaults to `5`.
|
||||
seq_len_notm (bool):
|
||||
seq_len_norm (bool):
|
||||
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
|
||||
is divided by the sequence length. Defaults to False.
|
||||
loss_masking (bool):
|
||||
|
|
|
@ -330,7 +330,7 @@ class TTSDataset(Dataset):
|
|||
if by_audio_len:
|
||||
lengths = []
|
||||
for item in self.items:
|
||||
lengths.append(os.path.getsize(item[1]))
|
||||
lengths.append(os.path.getsize(item[1]) / 16 * 8) # assuming 16bit audio
|
||||
lengths = np.array(lengths)
|
||||
else:
|
||||
lengths = np.array([len(ins[0]) for ins in self.items])
|
||||
|
|
|
@ -242,4 +242,4 @@ class BaseTacotron(BaseTTS):
|
|||
self.decoder.set_r(r)
|
||||
if trainer.config.bidirectional_decoder:
|
||||
trainer.model.decoder_backward.set_r(r)
|
||||
print(f"\n > Number of output frames: {self.decoder.r}")
|
||||
print(f"\n > Number of output frames: {self.decoder.r}")
|
|
@ -20,9 +20,9 @@ from TTS.utils.audio import AudioProcessor
|
|||
|
||||
|
||||
class BaseTTS(BaseModel):
|
||||
"""Abstract `tts` class. Every new `tts` model must inherit this.
|
||||
"""Base `tts` class. Every new `tts` model must inherit this.
|
||||
|
||||
It defines `tts` specific functions on top of `Model`.
|
||||
It defines common `tts` specific functions on top of `Model` implementation.
|
||||
|
||||
Notes on input/output tensor shapes:
|
||||
Any input or output tensor of the model must be shaped as
|
||||
|
|
Loading…
Reference in New Issue