Fix imports

This commit is contained in:
Eren Gölge 2021-09-08 13:34:40 +00:00
parent b0b96b427a
commit 4761853c5c
20 changed files with 24 additions and 96 deletions

View File

@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
class Encoder(nn.Module):

View File

@ -2,7 +2,7 @@ import numpy as np
import torch
from torch.nn import functional as F
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
try:
# TODO: fix pypi cython installation problem.

View File

@ -6,7 +6,7 @@ from coqpit import Coqpit
from torch import nn
from torch.nn import functional
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.ssim import ssim
from TTS.utils.audio import TorchSTFT

View File

@ -5,7 +5,7 @@ from torch import nn
from TTS.tts.layers.glow_tts.glow import WN
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
LRELU_SLOPE = 0.1

View File

@ -12,7 +12,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec

View File

@ -9,7 +9,7 @@ from torch import nn
from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
from TTS.tts.utils.text import make_symbols
from TTS.utils.generic_utils import format_aux_input

View File

@ -1,3 +1,4 @@
from TTS.tts.utils.helpers import segment
import os
from typing import Dict, List, Tuple

View File

@ -13,9 +13,10 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.helpers import average_over_durations
@dataclass
@ -416,7 +417,7 @@ class FastPitch(BaseTTS):
"""
o_pitch = self.pitch_predictor(o_en, x_mask)
if pitch is not None:
avg_pitch = average_pitch(pitch, dr)
avg_pitch = average_over_durations(pitch, dr)
o_pitch_emb = self.pitch_emb(avg_pitch)
return o_pitch_emb, o_pitch, avg_pitch
o_pitch_emb = self.pitch_emb(o_pitch)
@ -670,28 +671,3 @@ class FastPitch(BaseTTS):
"""Enable binary alignment loss when needed"""
if trainer.total_steps_done > self.config.binary_align_loss_start_step:
self.use_binary_alignment_loss = True
def average_pitch(pitch, durs):
    """Compute the average pitch value for each input character based on the durations.

    Frames whose pitch is exactly ``0.0`` are left out of the average, and a
    character whose span contains no non-zero frame gets ``0.0``.

    Shapes:
        - pitch: :math:`[B, 1, T_de]`
        - durs: :math:`[B, T_en]`
        - returns: same leading dims as ``pitch`` with the last dim equal to ``T_en``
    """
    # Frame index where each character ends / starts (exclusive / inclusive).
    durs_cums_ends = torch.cumsum(durs, dim=1).long()
    durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))

    # Prefix sums with a leading zero so that a span [start, end) is
    # ``cums[end] - cums[start]``.
    pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0))
    pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0))

    batch_size, num_tokens = durs_cums_ends.size()
    n_formants = pitch.size(1)
    dcs = durs_cums_starts[:, None, :].expand(batch_size, n_formants, num_tokens)
    dce = durs_cums_ends[:, None, :].expand(batch_size, n_formants, num_tokens)

    pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float()
    pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float()

    # Where a span has no non-zero frame, emit 0 instead of dividing by zero.
    pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems)
    return pitch_avg

View File

@ -9,7 +9,7 @@ from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram

View File

@ -10,7 +10,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor

View File

@ -14,7 +14,7 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
@ -22,28 +22,7 @@ from TTS.utils.audio import AudioProcessor
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
from TTS.vocoder.models.hifigan_generator import HifiganGenerator
from TTS.vocoder.utils.generic_utils import plot_results
def segment(x: torch.Tensor, segment_indices: torch.Tensor, segment_size=4):
    """Segment each sample in a batch based on the provided segment indices.

    Args:
        x: Batch to slice; indexed as ``x[i, :, start:end]``, so the last
            axis is the one being segmented.
        segment_indices: Start index along the last axis for every sample.
        segment_size: Length of the slice taken from each sample.

    Returns:
        A tensor shaped like ``x[:, :, :segment_size]`` holding one slice per sample.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        index_start = segment_indices[i]
        index_end = index_start + segment_size
        segments[i] = x[i, :, index_start:index_end]
    return segments
def rand_segment(x: torch.Tensor, x_lengths: torch.Tensor = None, segment_size=4):
    """Create random segments based on the input lengths.

    Args:
        x: Batch to slice; ``x.size()`` must unpack into three dimensions and
            the last one is segmented.
        x_lengths: Valid length of every sample. When ``None`` every sample is
            treated as spanning the full last dimension of ``x``.
        segment_size: Length of the slice taken from each sample.

    Returns:
        Tuple of the sliced batch and the start indices that were drawn.

    Raises:
        AssertionError: If any sample is shorter than ``segment_size``.
    """
    B, _, T = x.size()
    if x_lengths is None:
        # Build a per-sample length tensor; a bare int here would break the
        # element-wise length check below.
        x_lengths = torch.full((B,), T, dtype=torch.long, device=x.device)
    max_idxs = x_lengths - segment_size + 1
    assert bool((max_idxs > 0).all()), " [!] At least one sample is shorter than the segment size."
    segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
    ret = segment(x, segment_indices, segment_size)
    return ret, segment_indices
from TTS.tts.utils.helpers import rand_segments, segment
@dataclass
@ -451,7 +430,7 @@ class Vits(BaseTTS):
logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])
# select a random feature segment for the waveform decoder
z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size)
z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size)
o = self.waveform_decoder(z_slice, g=g)
outputs.update(
{

View File

@ -54,33 +54,3 @@ def pad_per_step(inputs, pad_len):
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
# pylint: disable=attribute-defined-outside-init
class StandardScaler:
    """Standardize arrays with externally supplied mean and scale statistics.

    The statistics are not fitted here; they must be provided through
    :meth:`set_stats` before :meth:`transform` or :meth:`inverse_transform`
    is called.
    """

    def set_stats(self, mean, scale):
        """Store the mean and scale used by the transforms."""
        self.mean_ = mean
        self.scale_ = scale

    def reset_stats(self):
        """Remove the stored statistics.

        Raises:
            AttributeError: If no statistics are currently set.
        """
        delattr(self, "mean_")
        delattr(self, "scale_")

    @staticmethod
    def _as_float_copy(X):
        """Return a private floating-point copy of ``X``."""
        # np.array copies, so the in-place math below never touches the
        # caller's buffer.
        X = np.array(X)
        if not np.issubdtype(X.dtype, np.inexact):
            # In-place float arithmetic on an integer array raises a casting error.
            X = X.astype(np.float64)
        return X

    def transform(self, X):
        """Return ``(X - mean) / scale`` without modifying ``X``."""
        X = self._as_float_copy(X)
        X -= self.mean_
        X /= self.scale_
        return X

    def inverse_transform(self, X):
        """Return ``X * scale + mean`` without modifying ``X``."""
        X = self._as_float_copy(X)
        X *= self.scale_
        X += self.mean_
        return X
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
    """Build a boolean mask that is ``True`` for positions inside each sequence.

    Args:
        sequence_length: 1-D tensor holding the length of every sequence.
        max_len: Width of the mask. Defaults to the largest value in
            ``sequence_length``.

    Returns:
        Boolean tensor of shape ``[B, max_len]`` where entry ``[b, t]`` is
        ``True`` iff ``t < sequence_length[b]``.
    """
    if max_len is None:
        max_len = sequence_length.max()
    seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
    # B x T_max
    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

View File

@ -101,6 +101,7 @@ def visualize(
figsize=(8, 24),
output_fig=False,
):
"""Intended to be used in Notebooks."""
if decoder_output is not None:
num_plot = 4

View File

@ -9,7 +9,7 @@ import soundfile as sf
import torch
from torch import nn
from TTS.tts.utils.data import StandardScaler
from TTS.tts.utils.helpers import StandardScaler
class TorchSTFT(nn.Module): # pylint: disable=abstract-method

View File

@ -7,7 +7,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
- If you feel like it's a bug to be fixed, then prefer GitHub issues with the same level of scrutiny.
## What are the requirements of a good 🐸TTS dataset?
* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset
* {ref}`See this page <what_makes_a_good_dataset>`
## How should I choose the right model?
- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2.

View File

@ -54,7 +54,7 @@
4. Run the training.
You need to call the python training script.
You need to run the training script.
```bash
$ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
@ -63,7 +63,7 @@
Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable.
To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.
If you like to run a multi-gpu training
If you would like to run multi-GPU training using the DDP back-end,
```bash
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py

View File

@ -1,3 +1,4 @@
(what_makes_a_good_dataset)=
# What makes a good TTS dataset
## What Makes a Good Dataset

View File

@ -2,7 +2,7 @@ import torch
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

View File

@ -3,7 +3,7 @@ import torch
from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

View File

@ -4,7 +4,7 @@ import torch as T
from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask
# pylint: disable=unused-variable