mirror of https://github.com/coqui-ai/TTS.git

Fix imports

This commit is contained in:
parent b0b96b427a
commit 4761853c5c
```diff
@@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask


 class Encoder(nn.Module):
```
```diff
@@ -2,7 +2,7 @@ import numpy as np
 import torch
 from torch.nn import functional as F

-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 try:
     # TODO: fix pypi cython installation problem.
```
```diff
@@ -6,7 +6,7 @@ from coqpit import Coqpit
 from torch import nn
 from torch.nn import functional

-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.ssim import ssim
 from TTS.utils.audio import TorchSTFT
```
```diff
@@ -5,7 +5,7 @@ from torch import nn

 from TTS.tts.layers.glow_tts.glow import WN
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 LRELU_SLOPE = 0.1
```
```diff
@@ -12,7 +12,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_fsspec
```
```diff
@@ -9,7 +9,7 @@ from torch import nn

 from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
 from TTS.tts.utils.text import make_symbols
 from TTS.utils.generic_utils import format_aux_input
```
```diff
@@ -1,3 +1,4 @@
+from TTS.tts.utils.helpers import segment
 import os
 from typing import Dict, List, Tuple

```
```diff
@@ -13,9 +13,10 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
+from TTS.tts.utils.helpers import average_over_durations


 @dataclass
```
```diff
@@ -416,7 +417,7 @@ class FastPitch(BaseTTS):
         """
         o_pitch = self.pitch_predictor(o_en, x_mask)
         if pitch is not None:
-            avg_pitch = average_pitch(pitch, dr)
+            avg_pitch = average_over_durations(pitch, dr)
             o_pitch_emb = self.pitch_emb(avg_pitch)
             return o_pitch_emb, o_pitch, avg_pitch
         o_pitch_emb = self.pitch_emb(o_pitch)
```
```diff
@@ -670,28 +671,3 @@ class FastPitch(BaseTTS):
         """Enable binary alignment loss when needed"""
         if trainer.total_steps_done > self.config.binary_align_loss_start_step:
             self.use_binary_alignment_loss = True
-
-
-def average_pitch(pitch, durs):
-    """Compute the average pitch value for each input character based on the durations.
-
-    Shapes:
-        - pitch: :math:`[B, 1, T_de]`
-        - durs: :math:`[B, T_en]`
-    """
-
-    durs_cums_ends = torch.cumsum(durs, dim=1).long()
-    durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
-    pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0))
-    pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0))
-
-    bs, l = durs_cums_ends.size()
-    n_formants = pitch.size(1)
-    dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
-    dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
-
-    pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float()
-    pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float()
-
-    pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems)
-    return pitch_avg
```
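The deleted `average_pitch` lives on as `average_over_durations` in `TTS.tts.utils.helpers` (the import and call-site hunks above confirm the rename and argument order). As a sanity check on what it computes, here is a minimal self-contained sketch reusing the deleted code: frame-level pitch values are averaged per input token, with zero (unvoiced) frames excluded from both the sum and the count.

```python
import torch

# Toy batch: 1 sample, 1 pitch channel, 6 frames, 3 input tokens (durations sum to 6)
pitch = torch.tensor([[[100.0, 110.0, 0.0, 200.0, 210.0, 190.0]]])  # [B, 1, T_de]
durs = torch.tensor([[2, 1, 3]])                                    # [B, T_en]

# cumulative frame spans per token, exactly as in the deleted function
durs_cums_ends = torch.cumsum(durs, dim=1).long()
durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0))
pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0))

bs, l = durs_cums_ends.size()
n_formants = pitch.size(1)
dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)

# per-token sums and voiced-frame counts come out of two gathers, no Python loop
pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float()
pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float()

avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems)
print(avg)  # tensor([[[105., 0., 200.]]]); the all-unvoiced middle token averages to 0
```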
```diff
@@ -9,7 +9,7 @@ from TTS.tts.layers.glow_tts.decoder import Decoder
 from TTS.tts.layers.glow_tts.encoder import Encoder
 from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import get_speaker_manager
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
```
```diff
@@ -10,7 +10,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.monotonic_align import generate_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
```
```diff
@@ -14,7 +14,7 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
 from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import get_speaker_manager
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.visual import plot_alignment
```
```diff
@@ -22,28 +22,7 @@ from TTS.utils.audio import AudioProcessor
 from TTS.utils.trainer_utils import get_optimizer, get_scheduler
 from TTS.vocoder.models.hifigan_generator import HifiganGenerator
 from TTS.vocoder.utils.generic_utils import plot_results
-
-
-def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
-    """Segment each sample in a batch based on the provided segment indices"""
-    segments = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        index_start = segment_indices[i]
-        index_end = index_start + segment_size
-        segments[i] = x[i, :, index_start:index_end]
-    return segments
-
-
-def rand_segment(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
-    """Create random segments based on the input lengths."""
-    B, _, T = x.size()
-    if x_lengths is None:
-        x_lengths = T
-    max_idxs = x_lengths - segment_size + 1
-    assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
-    segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
-    ret = segment(x, segment_indices, segment_size)
-    return ret, segment_indices
+from TTS.tts.utils.helpers import rand_segments, segment


 @dataclass
```
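With the local functions deleted, callers import `rand_segments` and `segment` from `TTS.tts.utils.helpers` instead. A usage sketch, assuming the relocated helpers keep the signatures of the deleted functions above (only `rand_segment` is renamed to `rand_segments`):

```python
import torch

from TTS.tts.utils.helpers import rand_segments, segment

# [B, C, T]: 2 samples, 192 latent channels, 50 frames; sample 2 is padded past frame 37
z = torch.randn(2, 192, 50)
z_lengths = torch.tensor([50, 37])

# draw one random fixed-size window per sample, staying inside each sample's true length
z_slice, slice_ids = rand_segments(z, z_lengths, 32)  # assumed signature: (x, x_lengths, segment_size)
print(z_slice.shape)  # torch.Size([2, 192, 32])

# `segment` re-cuts the same windows deterministically from the returned start indices
assert torch.equal(segment(z, slice_ids, 32), z_slice)
```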
```diff
@@ -451,7 +430,7 @@ class Vits(BaseTTS):
         logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])

         # select a random feature segment for the waveform decoder
-        z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size)
+        z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size)
         o = self.waveform_decoder(z_slice, g=g)
         outputs.update(
             {
```
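For context on the surrounding lines: the einsum in the first context line broadcasts per-token prior statistics (`logs_p`) to frame level through the alignment `attn` before the random slice is taken. A small shape check under assumed toy dimensions (3 tokens, 5 frames, one-hot alignment):

```python
import torch

# attn: [B, 1, T_en, T_de] hard alignment; logs_p: [B, C, T_en] per-token stats
attn = torch.zeros(1, 1, 3, 5)
attn[0, 0, 0, 0:2] = 1.0  # token 0 covers frames 0-1
attn[0, 0, 1, 2:3] = 1.0  # token 1 covers frame 2
attn[0, 0, 2, 3:5] = 1.0  # token 2 covers frames 3-4
logs_p = torch.randn(1, 4, 3)

# "klmn, kjm -> kjn": sum over the token axis m; each frame n inherits its token's stats
out = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])
print(out.shape)  # torch.Size([1, 4, 5])
assert torch.allclose(out[0, :, 0], logs_p[0, :, 0])  # frame 0 carries token 0's stats
```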
```diff
@@ -54,33 +54,3 @@ def pad_per_step(inputs, pad_len):
     return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
-
-
-# pylint: disable=attribute-defined-outside-init
-class StandardScaler:
-    def set_stats(self, mean, scale):
-        self.mean_ = mean
-        self.scale_ = scale
-
-    def reset_stats(self):
-        delattr(self, "mean_")
-        delattr(self, "scale_")
-
-    def transform(self, X):
-        X = np.asarray(X)
-        X -= self.mean_
-        X /= self.scale_
-        return X
-
-    def inverse_transform(self, X):
-        X = np.asarray(X)
-        X *= self.scale_
-        X += self.mean_
-        return X
-
-
-# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
-def sequence_mask(sequence_length, max_len=None):
-    if max_len is None:
-        max_len = sequence_length.data.max()
-    seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
-    # B x T_max
-    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
```
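`sequence_mask` moves to `TTS.tts.utils.helpers` unchanged: it builds a `[B, T_max]` boolean mask from per-sample lengths. A quick demonstration with a self-contained copy of the deleted function:

```python
import torch

# self-contained copy of the relocated function
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
    # B x T_max
    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

print(sequence_mask(torch.tensor([4, 2, 3])))
# tensor([[ True,  True,  True,  True],
#         [ True,  True, False, False],
#         [ True,  True,  True, False]])
```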
```diff
@@ -101,6 +101,7 @@ def visualize(
     figsize=(8, 24),
     output_fig=False,
 ):
+    """Intended to be used in Notebooks."""

     if decoder_output is not None:
         num_plot = 4
```
```diff
@@ -9,7 +9,7 @@ import soundfile as sf
 import torch
 from torch import nn

-from TTS.tts.utils.data import StandardScaler
+from TTS.tts.utils.helpers import StandardScaler


 class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
```
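`TorchSTFT`'s module now pulls `StandardScaler` from its new home as well; its contract is unchanged. A round-trip check with a self-contained copy of the class deleted earlier in this commit (`reset_stats` omitted for brevity):

```python
import numpy as np

# self-contained copy of the relocated scaler
class StandardScaler:
    def set_stats(self, mean, scale):
        self.mean_ = mean
        self.scale_ = scale

    def transform(self, X):
        X = np.asarray(X)
        X -= self.mean_
        X /= self.scale_
        return X

    def inverse_transform(self, X):
        X = np.asarray(X)
        X *= self.scale_
        X += self.mean_
        return X

scaler = StandardScaler()
scaler.set_stats(mean=np.array([0.5]), scale=np.array([2.0]))
x = np.array([1.5, 4.5])
x_norm = scaler.transform(x.copy())  # note: transform modifies its input in place
assert np.allclose(scaler.inverse_transform(x_norm), x)
```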
```diff
@@ -7,7 +7,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
 - If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny.

 ## What are the requirements of a good 🐸TTS dataset?
-* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset
+* {ref}`See this page <what_makes_a_good_dataset>`

 ## How should I choose the right model?
 - First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2.
```
````diff
@@ -54,7 +54,7 @@

 4. Run the training.

-    You need to call the python training script.
+    You need to run the training script.

     ```bash
     $ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
````
````diff
@@ -63,7 +63,7 @@
 Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable.
 To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.

-If you like to run a multi-gpu training
+If you like to run a multi-gpu training using DDP back-end,

 ```bash
 $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
````
```diff
@@ -1,3 +1,4 @@
+(what_makes_a_good_dataset)=
 # What makes a good TTS dataset

 ## What Makes a Good Dataset
```
```diff
@@ -2,7 +2,7 @@ import torch

 from TTS.tts.layers.feed_forward.decoder import Decoder
 from TTS.tts.layers.feed_forward.encoder import Encoder
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
```
```diff
@@ -3,7 +3,7 @@ import torch
 from TTS.tts.configs import SpeedySpeechConfig
 from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
 from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
```
```diff
@@ -4,7 +4,7 @@ import torch as T

 from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
 from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 # pylint: disable=unused-variable
```