diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index f3eb4655..50bcc451 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask class Encoder(nn.Module): diff --git a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py index ee058095..303467ed 100644 --- a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py +++ b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch.nn import functional as F -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask try: # TODO: fix pypi cython installation problem. 
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index a2fd7635..2c752376 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -6,7 +6,7 @@ from coqpit import Coqpit from torch import nn from torch.nn import functional -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.ssim import ssim from TTS.utils.audio import TorchSTFT diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index cf9d6e41..cfc8b6ac 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -5,7 +5,7 @@ from torch import nn from TTS.tts.layers.glow_tts.glow import WN from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask LRELU_SLOPE = 0.1 diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 2c3bed3d..3e5e7e5d 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -12,7 +12,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_fsspec diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 01291775..efe379d5 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -9,7 +9,7 @@ from torch import nn from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers 
import SpeakerManager, get_speaker_manager from TTS.tts.utils.text import make_symbols from TTS.utils.generic_utils import format_aux_input diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 06c7cb2b..90ac18e9 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,3 +1,4 @@ +from TTS.tts.utils.helpers import segment import os from typing import Dict, List, Tuple diff --git a/TTS/tts/models/fast_pitch.py b/TTS/tts/models/fast_pitch.py index 1dd0bd68..9a87fe0f 100644 --- a/TTS/tts/models/fast_pitch.py +++ b/TTS/tts/models/fast_pitch.py @@ -13,9 +13,10 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram from TTS.utils.audio import AudioProcessor +from TTS.tts.utils.helpers import average_over_durations @dataclass @@ -416,7 +417,7 @@ class FastPitch(BaseTTS): """ o_pitch = self.pitch_predictor(o_en, x_mask) if pitch is not None: - avg_pitch = average_pitch(pitch, dr) + avg_pitch = average_over_durations(pitch, dr) o_pitch_emb = self.pitch_emb(avg_pitch) return o_pitch_emb, o_pitch, avg_pitch o_pitch_emb = self.pitch_emb(o_pitch) @@ -670,28 +671,3 @@ class FastPitch(BaseTTS): """Enable binary alignment loss when needed""" if trainer.total_steps_done > self.config.binary_align_loss_start_step: self.use_binary_alignment_loss = True - - -def average_pitch(pitch, durs): - """Compute the average pitch value for each input character based on the durations. 
- - Shapes: - - pitch: :math:`[B, 1, T_de]` - - durs: :math:`[B, T_en]` - """ - - durs_cums_ends = torch.cumsum(durs, dim=1).long() - durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0)) - pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0)) - pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0)) - - bs, l = durs_cums_ends.size() - n_formants = pitch.size(1) - dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l) - dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l) - - pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float() - pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float() - - pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems) - return pitch_avg diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index b063b6b4..2c194a64 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -9,7 +9,7 @@ from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 86109e74..8dbf0e0a 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -10,7 +10,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path from TTS.tts.models.base_tts import BaseTTS -from 
TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 0da43f90..0aceb61c 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -14,7 +14,7 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment @@ -22,28 +22,7 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.trainer_utils import get_optimizer, get_scheduler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results - - -def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4): - """Segment each sample in a batch based on the provided segment indices""" - segments = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - index_start = segment_indices[i] - index_end = index_start + segment_size - segments[i] = x[i, :, index_start:index_end] - return segments - - -def rand_segment(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4): - """Create random segments based on the input lengths.""" - B, _, T = x.size() - if x_lengths is None: - x_lengths = T - max_idxs = x_lengths - segment_size + 1 - assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size." 
- segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long() - ret = segment(x, segment_indices, segment_size) - return ret, segment_indices +from TTS.tts.utils.helpers import rand_segments, segment @dataclass @@ -451,7 +430,7 @@ class Vits(BaseTTS): logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) # select a random feature segment for the waveform decoder - z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size) + z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) outputs.update( { diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 887f4376..7a766958 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -54,33 +54,3 @@ def pad_per_step(inputs, pad_len): return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0) -# pylint: disable=attribute-defined-outside-init -class StandardScaler: - def set_stats(self, mean, scale): - self.mean_ = mean - self.scale_ = scale - - def reset_stats(self): - delattr(self, "mean_") - delattr(self, "scale_") - - def transform(self, X): - X = np.asarray(X) - X -= self.mean_ - X /= self.scale_ - return X - - def inverse_transform(self, X): - X = np.asarray(X) - X *= self.scale_ - X += self.mean_ - return X - - -# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.data.max() - seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) - # B x T_max - return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py index 7101ed3d..ff71958e 100644 --- a/TTS/tts/utils/visual.py +++ b/TTS/tts/utils/visual.py @@ -101,6 +101,7 @@ def visualize( figsize=(8, 24), output_fig=False, ): + """Intended to be used in Notebooks.""" if decoder_output is not None: num_plot = 4 diff --git 
a/TTS/utils/audio.py b/TTS/utils/audio.py index 01d1f7d1..a8952afc 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -9,7 +9,7 @@ import soundfile as sf import torch from torch import nn -from TTS.tts.utils.data import StandardScaler +from TTS.tts.utils.helpers import StandardScaler class TorchSTFT(nn.Module): # pylint: disable=abstract-method diff --git a/docs/source/faq.md b/docs/source/faq.md index 4dbaab13..3de6663d 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -7,7 +7,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. ## What are the requirements of a good 🐸TTS dataset? -* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset +* {ref}`See this page <what_makes_a_good_dataset>` ## How should I choose the right model? - First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md index a7e81f28..aadd741e 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training_a_model.md @@ -54,7 +54,7 @@ 4. Run the training. - You need to call the python training script. + You need to run the training script. ```bash $ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py @@ -63,7 +63,7 @@ Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable. To see available GPUs on your system, you can use `nvidia-smi` command on the terminal. 
- If you like to run a multi-gpu training + If you like to run a multi-gpu training using DDP back-end, ```bash $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/what_makes_a_good_dataset.md index 49a2943b..18c87453 100644 --- a/docs/source/what_makes_a_good_dataset.md +++ b/docs/source/what_makes_a_good_dataset.md @@ -1,3 +1,4 @@ +(what_makes_a_good_dataset)= # What makes a good TTS dataset ## What Makes a Good Dataset diff --git a/tests/tts_tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py index 1c2d3803..6b26b88f 100644 --- a/tests/tts_tests/test_feed_forward_layers.py +++ b/tests/tts_tests/test_feed_forward_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.encoder import Encoder -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index a5c481f1..adf17b8b 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -3,7 +3,7 @@ import torch from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 783be0db..fdce75dd 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -4,7 +4,7 @@ import torch as T from 
TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet -from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.helpers import sequence_mask # pylint: disable=unused-variable