mirror of https://github.com/coqui-ai/TTS.git
commit 0592a5805c
@@ -142,7 +142,6 @@ old_configs/*
 model_importers/*
 model_profiling/*
 docs/source/TODO/*
 docs/source/models/*
 .noseids
 .dccache
 log.txt
Makefile

@@ -1,5 +1,5 @@
 .DEFAULT_GOAL := help
-.PHONY: test system-deps dev-deps deps style lint install help
+.PHONY: test system-deps dev-deps deps style lint install help docs

 help:
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
@@ -45,3 +45,6 @@ deps: ## install 🐸 requirements.

 install: ## install 🐸 TTS for development.
 	pip install -e .[all]
+
+docs: ## build the docs
+	$(MAKE) -C docs clean && $(MAKE) -C docs html
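The new `docs` target chains Sphinx's own Makefile entry points, so a plain `make docs` performs a clean HTML rebuild. This assumes the standard layout where `docs/` ships a Sphinx-generated Makefile with `clean` and `html` targets.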
@@ -72,6 +72,8 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
 - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
 - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
 - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
+- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)

 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)

@@ -82,6 +84,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
 - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
 - Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
+- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)

 ### Speaker Encoder
 - GE2E: [paper](https://arxiv.org/abs/1710.10467)
@@ -38,6 +38,16 @@
       "license": "MPL",
       "contact": "egolge@coqui.com"
     },
+    "speedy-speech": {
+      "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
+      "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
+      "stats_file": null,
+      "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+      "commit": "4581e3d",
+      "author": "Eren Gölge @erogol",
+      "license": "TBD",
+      "contact": "egolge@coqui.com"
+    },
     "tacotron2-DCA": {
       "description": "",
       "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",

@@ -47,15 +57,6 @@
       "license": "MPL",
       "contact": "egolge@coqui.com"
     },
-    "speedy-speech-wn": {
-      "description": "Speedy Speech model with wavenet decoder.",
-      "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip",
-      "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
-      "commit": "77b6145",
-      "author": "Eren Gölge @erogol",
-      "license": "MPL",
-      "contact": "egolge@coqui.com"
-    },
     "vits": {
       "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
       "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip",

@@ -218,11 +219,11 @@
       "contact": "egolge@coqui.ai"
     },
     "univnet": {
-      "description": "UnivNet model trained on LJSpeech to complement the TacotronDDC_ph model.",
-      "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/vocoder_models--en--ljspeech--univnet.zip",
-      "commit": "3900448",
+      "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
+      "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
+      "commit": "4581e3d",
       "author": "Eren @erogol",
-      "license": "",
+      "license": "TBD",
       "contact": "egolge@coqui.ai"
     }
   },
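As a quick sanity check of the new catalog entry, a minimal sketch that resolves the release URL and default vocoder with only the standard library. The `tts_models/en/ljspeech` nesting and the `TTS/.models.json` path are assumptions inferred from the hunks above.

import json
from pathlib import Path

models = json.loads(Path("TTS/.models.json").read_text(encoding="utf-8"))
entry = models["tts_models"]["en"]["ljspeech"]["speedy-speech"]
print(entry["github_rls_url"])   # the v0.3.0 release zip
print(entry["default_vocoder"])  # vocoder_models/en/ljspeech/hifigan_v2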
@@ -1 +1 @@
-0.2.2
+0.3.0
@@ -16,7 +16,6 @@ from TTS.tts.models import setup_model
 from TTS.tts.utils.speakers import get_speaker_manager
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import count_parameters
-from TTS.utils.io import load_fsspec

 use_cuda = torch.cuda.is_available()
@@ -77,14 +76,14 @@ def set_filename(wav_path, out_path):

 def format_data(data):
     # setup input data
-    text_input = data['text']
-    text_lengths = data['text_lengths']
-    mel_input = data['mel']
-    mel_lengths = data['mel_lengths']
-    item_idx = data['item_idxs']
-    d_vectors = data['d_vectors']
-    speaker_ids = data['speaker_ids']
-    attn_mask = data['attns']
+    text_input = data["text"]
+    text_lengths = data["text_lengths"]
+    mel_input = data["mel"]
+    mel_lengths = data["mel_lengths"]
+    item_idx = data["item_idxs"]
+    d_vectors = data["d_vectors"]
+    speaker_ids = data["speaker_ids"]
+    attn_mask = data["attns"]
     avg_text_length = torch.mean(text_lengths.float())
     avg_spec_length = torch.mean(mel_lengths.float())
@@ -133,7 +132,11 @@ def inference(
     elif d_vectors is not None:
         speaker_c = d_vectors
     outputs = model.inference_with_MAS(
-        text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}
+        text_input,
+        text_lengths,
+        mel_input,
+        mel_lengths,
+        aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
     )
     model_output = outputs["model_outputs"]
     model_output = model_output.transpose(1, 2).detach().cpu().numpy()
@@ -239,8 +242,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     model = setup_model(c)

     # restore model
-    checkpoint = load_fsspec(args.checkpoint_path, map_location="cpu")
-    model.load_state_dict(checkpoint["model"])
+    model.load_checkpoint(c, args.checkpoint_path, eval=True)

     if use_cuda:
         model.cuda()
@@ -205,7 +205,7 @@ class Trainer:
             # load data for `tts` models
             self.data_train, self.data_eval = load_meta_data(self.config.datasets)
         elif self.config.feature_path is not None:
-            # load data for `vocoder` models
+            # load pre-computed features for `vocoder` models
             print(f" > Loading features from: {self.config.feature_path}")
             self.data_eval, self.data_train = load_wav_feat_data(
                 self.config.data_path, self.config.feature_path, self.config.eval_split_size
@@ -275,6 +275,7 @@ class Trainer:
         if self.args.continue_path:
             if isinstance(self.scheduler, list):
                 for scheduler in self.scheduler:
                     if scheduler is not None:
                         scheduler.last_epoch = self.restore_step
             else:
                 self.scheduler.last_epoch = self.restore_step
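Why `last_epoch` matters on resume: PyTorch schedulers compute the learning rate from their step counter, so a restored run must fast-forward that counter or the warm-up/decay restarts from zero. A minimal, self-contained sketch of the idea; plain `LambdaLR` stands in here for the project's Noam scheduler.

import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda step: 0.5 ** (step // 10))

restore_step = 25
sched.last_epoch = restore_step   # what the Trainer does under `continue_path`
sched.step()                      # counter advances to 26 ...
print(opt.param_groups[0]["lr"])  # ... so lr = 1e-3 * 0.5**2, not the step-0 value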
@@ -662,6 +663,7 @@ class Trainer:
         lrs = {"current_lr": current_lr}

         # log run-time stats
         loss_dict.update(lrs)
         loss_dict.update(
             {
                 "step_time": round(step_time, 4),
@@ -878,7 +880,7 @@ class Trainer:
        """Restore the best loss from the args.best_path if provided else
        from the model (`args.restore_path` or `args.continue_path`) used for resuming the training"""
        if self.restore_step != 0 or self.args.best_path:
-            print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...")
+            print(f" > Restoring best loss from {os.path.basename(self.args.best_path)} ...")
             ch = load_fsspec(self.args.restore_path, map_location="cpu")
             if "model_loss" in ch:
                 self.best_loss = ch["model_loss"]
@@ -1125,7 +1127,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
             last_model_num = model_num
             last_model = file_name

-    # if there is not checkpoint found above
+    # if there is no checkpoint found above
     # find the checkpoint with the latest
     # modification date.
     key_file_names = [fn for fn in file_names if key in fn]
@@ -1144,7 +1146,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
            last_models["checkpoint"] = last_models["best_model"]
        elif "best_model" not in last_models:  # no best model
            # this shouldn't happen, but let's handle it just in case
-            last_models["best_model"] = None
+            last_models["best_model"] = last_models["checkpoint"]
        # finally check if last best model is more recent than checkpoint
        elif last_model_nums["best_model"] > last_model_nums["checkpoint"]:
            last_models["checkpoint"] = last_models["best_model"]
@@ -1180,7 +1182,6 @@ def process_args(args, config=None):
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model

    # init config if not already defined
    if config is None:
        if args.config_path:
@@ -2,12 +2,12 @@ from dataclasses import dataclass, field
 from typing import List

 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.models.fast_pitch import FastPitchArgs
+from TTS.tts.models.forward_tts import ForwardTTSArgs


 @dataclass
 class FastPitchConfig(BaseTTSConfig):
-    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
+    """Configure `ForwardTTS` as FastPitch model.

     Example:
@@ -18,6 +18,10 @@ class FastPitchConfig(BaseTTSConfig):
         model (str):
             Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.

+        base_model (str):
+            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
         model_args (Coqpit):
             Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
@@ -36,22 +40,43 @@ class FastPitchConfig(BaseTTSConfig):
         d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.

-        noam_schedule (bool):
-            enable / disable the use of Noam LR scheduler. Defaults to False.
+        d_vector_dim (int):
+            Dimension of the external speaker embeddings. Defaults to 0.

-        warmup_steps (int):
-            Number of warm-up steps for the Noam scheduler. Defaults 4000.
+        optimizer (str):
+            Name of the model optimizer. Defaults to `Adam`.
+
+        optimizer_params (dict):
+            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+        lr_scheduler (str):
+            Name of the learning rate scheduler. Defaults to `Noam`.
+
+        lr_scheduler_params (dict):
+            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+
+        grad_clip (float):
+            Gradient norm clipping value. Defaults to `5.0`.
+
+        spec_loss_type (str):
+            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        duration_loss_type (str):
+            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        use_ssim_loss (bool):
+            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.

         wd (float):
             Weight decay coefficient. Defaults to `1e-7`.

         ssim_loss_alpha (float):
             Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.

-        huber_loss_alpha (float):
+        dur_loss_alpha (float):
             Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.

         spec_loss_alpha (float):
@@ -74,8 +99,10 @@ class FastPitchConfig(BaseTTSConfig):
     """

     model: str = "fast_pitch"
+    base_model: str = "forward_tts"

     # model specific params
-    model_args: FastPitchArgs = field(default_factory=FastPitchArgs)
+    model_args: ForwardTTSArgs = ForwardTTSArgs()

     # multi-speaker settings
     use_speaker_embedding: bool = False

@@ -92,11 +119,13 @@ class FastPitchConfig(BaseTTSConfig):
     grad_clip: float = 5.0

     # loss params
+    spec_loss_type: str = "mse"
+    duration_loss_type: str = "mse"
+    use_ssim_loss: bool = True
     ssim_loss_alpha: float = 1.0
+    dur_loss_alpha: float = 1.0
     spec_loss_alpha: float = 1.0
     pitch_loss_alpha: float = 1.0
-    dur_loss_alpha: float = 1.0
     aligner_loss_alpha: float = 1.0
     binary_align_loss_alpha: float = 1.0
     binary_align_loss_start_step: int = 20000
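A hedged usage sketch of the reworked config. Field names come from the hunks above; the override values are purely illustrative, and a real training config needs the dataset fields filled in as well.

from TTS.tts.configs import FastPitchConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs

# FastPitch is now just ForwardTTS with the pitch predictor switched on.
config = FastPitchConfig(
    model_args=ForwardTTSArgs(use_pitch=True),
    spec_loss_type="mse",
    binary_align_loss_start_step=20000,
)
assert config.base_model == "forward_tts"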
@@ -0,0 +1,151 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class FastSpeechConfig(BaseTTSConfig):
+    """Configure `ForwardTTS` as FastSpeech model.
+
+    Example:
+
+        >>> from TTS.tts.configs import FastSpeechConfig
+        >>> config = FastSpeechConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
+
+        base_model (str):
+            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+        model_args (Coqpit):
+            Model class arguments. Check `FastSpeechArgs` for more details. Defaults to `FastSpeechArgs()`.
+
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.
+
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+
+        use_d_vector_file (bool):
+            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+        d_vector_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+        d_vector_dim (int):
+            Dimension of the external speaker embeddings. Defaults to 0.
+
+        optimizer (str):
+            Name of the model optimizer. Defaults to `Adam`.
+
+        optimizer_params (dict):
+            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+        lr_scheduler (str):
+            Name of the learning rate scheduler. Defaults to `Noam`.
+
+        lr_scheduler_params (dict):
+            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+
+        grad_clip (float):
+            Gradient norm clipping value. Defaults to `5.0`.
+
+        spec_loss_type (str):
+            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        duration_loss_type (str):
+            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        use_ssim_loss (bool):
+            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+
+        ssim_loss_alpha (float):
+            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+        dur_loss_alpha (float):
+            Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+        spec_loss_alpha (float):
+            Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+        pitch_loss_alpha (float):
+            Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
+
+        binary_loss_alpha (float):
+            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+        binary_align_loss_start_step (int):
+            Start binary alignment loss after this many steps. Defaults to 20000.
+
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+    """
+
+    model: str = "fast_speech"
+    base_model: str = "forward_tts"
+
+    # model specific params
+    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+
+    # multi-speaker settings
+    use_speaker_embedding: bool = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
+    d_vector_dim: int = 0
+
+    # optimizer parameters
+    optimizer: str = "Adam"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+    lr_scheduler: str = "NoamLR"
+    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+    lr: float = 1e-4
+    grad_clip: float = 5.0
+
+    # loss params
+    spec_loss_type: str = "mse"
+    duration_loss_type: str = "mse"
+    use_ssim_loss: bool = True
+    ssim_loss_alpha: float = 1.0
+    dur_loss_alpha: float = 1.0
+    spec_loss_alpha: float = 1.0
+    pitch_loss_alpha: float = 0.0
+    aligner_loss_alpha: float = 1.0
+    binary_align_loss_alpha: float = 1.0
+    binary_align_loss_start_step: int = 20000
+
+    # overrides
+    min_seq_len: int = 13
+    max_seq_len: int = 200
+    r: int = 1  # DO NOT CHANGE
+
+    # dataset configs
+    compute_f0: bool = True
+    f0_cache_path: str = None
+
+    # testing
+    test_sentences: List[str] = field(
+        default_factory=lambda: [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "Be a voice, not an echo.",
+            "I'm sorry Dave. I'm afraid I can't do that.",
+            "This cake is great. It's so delicious and moist.",
+            "Prior to November 22, 1963.",
+        ]
+    )
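A matching sketch for the new config; the two asserted defaults come straight from the file above.

from TTS.tts.configs import FastSpeechConfig

config = FastSpeechConfig()
assert config.model_args.use_pitch is False  # FastSpeech carries no pitch predictor
assert config.pitch_loss_alpha == 0.0        # so its loss weight defaults to zero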
@@ -2,12 +2,12 @@ from dataclasses import dataclass, field
 from typing import List

 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.models.speedy_speech import SpeedySpeechArgs
+from TTS.tts.models.forward_tts import ForwardTTSArgs


 @dataclass
 class SpeedySpeechConfig(BaseTTSConfig):
-    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
+    """Configure `ForwardTTS` as SpeedySpeech model.

     Example:

@@ -17,66 +17,145 @@ class SpeedySpeechConfig(BaseTTSConfig):
     Args:
         model (str):
             Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.

+        base_model (str):
+            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
         model_args (Coqpit):
-            Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
+            Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
+
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.

         use_speaker_embedding (bool):
             enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
             in the multi-speaker mode. Defaults to False.

         use_d_vector_file (bool):
             enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.

         d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.
-        noam_schedule (bool):
-            enable / disable the use of Noam LR scheduler. Defaults to False.
-        warmup_steps (int):
-            Number of warm-up steps for the Noam scheduler. Defaults 4000.

+        d_vector_dim (int):
+            Dimension of the external speaker embeddings. Defaults to 0.
+
+        optimizer (str):
+            Name of the model optimizer. Defaults to `RAdam`.
+
+        optimizer_params (dict):
+            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+        lr_scheduler (str):
+            Name of the learning rate scheduler. Defaults to `Noam`.
+
+        lr_scheduler_params (dict):
+            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+
+        grad_clip (float):
+            Gradient norm clipping value. Defaults to `5.0`.
+
+        spec_loss_type (str):
+            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `l1`.
+
+        duration_loss_type (str):
+            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `huber`.
+
+        use_ssim_loss (bool):
+            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.

         wd (float):
             Weight decay coefficient. Defaults to `1e-7`.
-        ssim_alpha (float):
-            Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
-        huber_alpha (float):
-            Weight for the duration predictor's loss. Defaults to 1.0.
-        l1_alpha (float):
-            Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.

+        ssim_loss_alpha (float):
+            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+        dur_loss_alpha (float):
+            Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+        spec_loss_alpha (float):
+            Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+        binary_loss_alpha (float):
+            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+        binary_align_loss_start_step (int):
+            Start binary alignment loss after this many steps. Defaults to 20000.

         min_seq_len (int):
             Minimum input sequence length to be used at training.

         max_seq_len (int):
             Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
     """

     model: str = "speedy_speech"
-    # model specific params
-    model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
+    base_model: str = "forward_tts"
+
+    # set model args as SpeedySpeech
+    model_args: ForwardTTSArgs = ForwardTTSArgs(
+        use_pitch=False,
+        encoder_type="residual_conv_bn",
+        encoder_params={
+            "kernel_size": 4,
+            "dilations": 4 * [1, 2, 4] + [1],
+            "num_conv_blocks": 2,
+            "num_res_blocks": 13,
+        },
+        decoder_type="residual_conv_bn",
+        decoder_params={
+            "kernel_size": 4,
+            "dilations": 4 * [1, 2, 4, 8] + [1],
+            "num_conv_blocks": 2,
+            "num_res_blocks": 17,
+        },
+        out_channels=80,
+        hidden_channels=128,
+        num_speakers=0,
+        positional_encoding=True,
+        detach_duration_predictor=True
+    )

     # multi-speaker settings
     use_speaker_embedding: bool = False
     use_d_vector_file: bool = False
     d_vector_file: str = False
     d_vector_dim: int = 0

     # optimizer parameters
-    optimizer: str = "RAdam"
+    optimizer: str = "Adam"
     optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
-    lr_scheduler: str = None
-    lr_scheduler_params: dict = None
+    lr_scheduler: str = "NoamLR"
+    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
     lr: float = 1e-4
     grad_clip: float = 5.0

     # loss params
-    ssim_alpha: float = 1.0
-    huber_alpha: float = 1.0
-    l1_alpha: float = 1.0
+    spec_loss_type: str = "l1"
+    duration_loss_type: str = "huber"
+    use_ssim_loss: bool = False
+    ssim_loss_alpha: float = 1.0
+    dur_loss_alpha: float = 1.0
+    spec_loss_alpha: float = 1.0
+    aligner_loss_alpha: float = 1.0
+    binary_align_loss_alpha: float = 0.3
+    binary_align_loss_start_step: int = 50000

     # overrides
     min_seq_len: int = 13
     max_seq_len: int = 200
     r: int = 1  # DO NOT CHANGE

     # dataset configs
+    compute_f0: bool = False
+    f0_cache_path: str = None

     # testing
     test_sentences: List[str] = field(
         default_factory=lambda: [
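A quick check that the Speedy Speech variant now rides on `ForwardTTS`. The import path mirrors the `TTS.tts.configs` pattern shown in the docstrings above; the asserted values come from the hunk itself.

from TTS.tts.configs import SpeedySpeechConfig

config = SpeedySpeechConfig()
assert config.base_model == "forward_tts"
assert config.model_args.use_pitch is False  # no pitch predictor in SpeedySpeech
assert config.model_args.encoder_type == "residual_conv_bn"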
@@ -1,15 +1 @@
 from TTS.tts.layers.losses import *
-
-
-def setup_loss(config):
-    if config.model.lower() in ["tacotron", "tacotron2"]:
-        model = TacotronLoss(config)
-    elif config.model.lower() == "glow_tts":
-        model = GlowTTSLoss()
-    elif config.model.lower() == "speedy_speech":
-        model = SpeedySpeechLoss(config)
-    elif config.model.lower() == "align_tts":
-        model = AlignTTSLoss(config)
-    else:
-        raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.")
-    return model
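With `setup_loss` gone, loss selection lives on the models themselves: each model's `get_criterion` (for example `ForwardTTS.get_criterion` in a later hunk, or `BaseTacotron.get_criterion`) returns its own criterion instead of going through this central registry.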
@@ -70,7 +70,9 @@ class FFTransformerBlock(nn.Module):


 class FFTDurationPredictor:
-    def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None):  # pylint: disable=unused-argument
+    def __init__(
+        self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
+    ):  # pylint: disable=unused-argument
         self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
         self.proj = nn.Linear(in_channels, 1)
@@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask


 class Encoder(nn.Module):
@@ -1,106 +0,0 @@
-import numpy as np
-import torch
-from torch.nn import functional as F
-
-from TTS.tts.utils.data import sequence_mask
-
-try:
-    # TODO: fix pypi cython installation problem.
-    from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c
-
-    CYTHON = True
-except ModuleNotFoundError:
-    CYTHON = False
-
-
-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-
-
-def generate_path(duration, mask):
-    """
-    Shapes:
-        - duration: :math:`[B, T_en]`
-        - mask: :math:`[B, T_en, T_de]`
-        - path: :math:`[B, T_en, T_de]`
-    """
-    device = duration.device
-    b, t_x, t_y = mask.shape
-    cum_duration = torch.cumsum(duration, 1)
-    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
-
-    cum_duration_flat = cum_duration.view(b * t_x)
-    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
-    path = path.view(b, t_x, t_y)
-    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-    path = path * mask
-    return path
-
-
-def maximum_path(value, mask):
-    if CYTHON:
-        return maximum_path_cython(value, mask)
-    return maximum_path_numpy(value, mask)
-
-
-def maximum_path_cython(value, mask):
-    """Cython optimised version.
-    Shapes:
-        - value: :math:`[B, T_en, T_de]`
-        - mask: :math:`[B, T_en, T_de]`
-    """
-    value = value * mask
-    device = value.device
-    dtype = value.dtype
-    value = value.data.cpu().numpy().astype(np.float32)
-    path = np.zeros_like(value).astype(np.int32)
-    mask = mask.data.cpu().numpy()
-
-    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
-    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
-    maximum_path_c(path, value, t_x_max, t_y_max)
-    return torch.from_numpy(path).to(device=device, dtype=dtype)
-
-
-def maximum_path_numpy(value, mask, max_neg_val=None):
-    """
-    Monotonic alignment search algorithm
-    Numpy-friendly version. It's about 4 times faster than torch version.
-    value: [b, t_x, t_y]
-    mask: [b, t_x, t_y]
-    """
-    if max_neg_val is None:
-        max_neg_val = -np.inf  # Patch for Sphinx complaint
-    value = value * mask
-
-    device = value.device
-    dtype = value.dtype
-    value = value.cpu().detach().numpy()
-    mask = mask.cpu().detach().numpy().astype(np.bool)
-
-    b, t_x, t_y = value.shape
-    direction = np.zeros(value.shape, dtype=np.int64)
-    v = np.zeros((b, t_x), dtype=np.float32)
-    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
-    for j in range(t_y):
-        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
-        v1 = v
-        max_mask = v1 >= v0
-        v_max = np.where(max_mask, v1, v0)
-        direction[:, :, j] = max_mask
-
-        index_mask = x_range <= j
-        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
-    direction = np.where(mask, direction, 1)
-
-    path = np.zeros(value.shape, dtype=np.float32)
-    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
-    index_range = np.arange(b)
-    for j in reversed(range(t_y)):
-        path[index_range, index, j] = 1
-        index = index + direction[index_range, index, j] - 1
-    path = path * mask.astype(np.float32)
-    path = torch.from_numpy(path).to(device=device, dtype=dtype)
-    return path
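This module's helpers do not disappear; the later hunks import the same functions from their new home:

from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask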
@@ -6,7 +6,7 @@ from coqpit import Coqpit
 from torch import nn
 from torch.nn import functional

-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.ssim import ssim
 from TTS.utils.audio import TorchSTFT
@@ -236,10 +236,40 @@ class Huber(nn.Module):
             y: B x T
             length: B
         """
-        mask = sequence_mask(sequence_length=length, max_len=y.size(1)).float()
+        mask = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2).float()
         return torch.nn.functional.smooth_l1_loss(x * mask, y * mask, reduction="sum") / mask.sum()
+
+
+class ForwardSumLoss(nn.Module):
+    def __init__(self, blank_logprob=-1):
+        super().__init__()
+        self.log_softmax = torch.nn.LogSoftmax(dim=3)
+        self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
+        self.blank_logprob = blank_logprob
+
+    def forward(self, attn_logprob, in_lens, out_lens):
+        key_lens = in_lens
+        query_lens = out_lens
+        attn_logprob_padded = torch.nn.functional.pad(input=attn_logprob, pad=(1, 0), value=self.blank_logprob)
+
+        total_loss = 0.0
+        for bid in range(attn_logprob.shape[0]):
+            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
+            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[: query_lens[bid], :, : key_lens[bid] + 1]
+
+            curr_logprob = self.log_softmax(curr_logprob[None])[0]
+            loss = self.ctc_loss(
+                curr_logprob,
+                target_seq,
+                input_lengths=query_lens[bid : bid + 1],
+                target_lengths=key_lens[bid : bid + 1],
+            )
+            total_loss = total_loss + loss
+
+        total_loss = total_loss / attn_logprob.shape[0]
+        return total_loss
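A hedged smoke test for the new CTC-style alignment loss. The shape `[B, 1, T_de, T_en]` (decoder frames against encoder tokens) is inferred from the indexing above, and the dummy lengths are arbitrary.

import torch
from TTS.tts.layers.losses import ForwardSumLoss

B, T_de, T_en = 2, 50, 12
attn_logprob = torch.randn(B, 1, T_de, T_en)  # unnormalized attention scores
in_lens = torch.tensor([12, 9])               # text (key) lengths
out_lens = torch.tensor([50, 40])             # spectrogram (query) lengths

criterion = ForwardSumLoss()
loss = criterion(attn_logprob, in_lens, out_lens)
print(loss)  # scalar; training pushes attention mass onto monotonic alignments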

 ########################
 # MODEL LOSS LAYERS
 ########################
@@ -413,25 +443,6 @@ class GlowTTSLoss(torch.nn.Module):
         return return_dict


-class SpeedySpeechLoss(nn.Module):
-    def __init__(self, c):
-        super().__init__()
-        self.l1 = L1LossMasked(False)
-        self.ssim = SSIMLoss()
-        self.huber = Huber()
-
-        self.ssim_alpha = c.ssim_alpha
-        self.huber_alpha = c.huber_alpha
-        self.l1_alpha = c.l1_alpha
-
-    def forward(self, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens):
-        l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
-        ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
-        huber_loss = self.huber(dur_output, dur_target, input_lens)
-        loss = self.l1_alpha * l1_loss + self.ssim_alpha * ssim_loss + self.huber_alpha * huber_loss
-        return {"loss": loss, "loss_l1": l1_loss, "loss_ssim": ssim_loss, "loss_dur": huber_loss}
-
-
 def mse_loss_custom(x, y):
     """MSE loss using the torch back-end without reduction.
     It uses less VRAM than the raw code"""
@@ -660,51 +671,41 @@ class VitsDiscriminatorLoss(nn.Module):
         return return_dict


-class ForwardSumLoss(nn.Module):
-    def __init__(self, blank_logprob=-1):
-        super().__init__()
-        self.log_softmax = torch.nn.LogSoftmax(dim=3)
-        self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
-        self.blank_logprob = blank_logprob
+class ForwardTTSLoss(nn.Module):
+    """Generic configurable ForwardTTS loss."""

-    def forward(self, attn_logprob, in_lens, out_lens):
-        key_lens = in_lens
-        query_lens = out_lens
-        attn_logprob_padded = torch.nn.functional.pad(input=attn_logprob, pad=(1, 0), value=self.blank_logprob)
-
-        total_loss = 0.0
-        for bid in range(attn_logprob.shape[0]):
-            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
-            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[: query_lens[bid], :, : key_lens[bid] + 1]
-
-            curr_logprob = self.log_softmax(curr_logprob[None])[0]
-            loss = self.ctc_loss(
-                curr_logprob,
-                target_seq,
-                input_lengths=query_lens[bid : bid + 1],
-                target_lengths=key_lens[bid : bid + 1],
-            )
-            total_loss = total_loss + loss
-
-        total_loss = total_loss / attn_logprob.shape[0]
-        return total_loss
-
-
-class FastPitchLoss(nn.Module):
     def __init__(self, c):
         super().__init__()
         if c.spec_loss_type == "mse":
             self.spec_loss = MSELossMasked(False)
-            self.ssim = SSIMLoss()
         elif c.spec_loss_type == "l1":
             self.spec_loss = L1LossMasked(False)
         else:
             raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type))

         if c.duration_loss_type == "mse":
             self.dur_loss = MSELossMasked(False)
-            self.pitch_loss = MSELossMasked(False)
         elif c.duration_loss_type == "l1":
             self.dur_loss = L1LossMasked(False)
         elif c.duration_loss_type == "huber":
             self.dur_loss = Huber()
         else:
             raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type))

+        if c.model_args.use_aligner:
+            self.aligner_loss = ForwardSumLoss()
+            self.aligner_loss_alpha = c.aligner_loss_alpha
+
+        if c.model_args.use_pitch:
+            self.pitch_loss = MSELossMasked(False)
+            self.pitch_loss_alpha = c.pitch_loss_alpha
+
+        if c.use_ssim_loss:
+            self.ssim = SSIMLoss() if c.use_ssim_loss else None
+            self.ssim_loss_alpha = c.ssim_loss_alpha

         self.spec_loss_alpha = c.spec_loss_alpha
-        self.ssim_loss_alpha = c.ssim_loss_alpha
         self.dur_loss_alpha = c.dur_loss_alpha
-        self.pitch_loss_alpha = c.pitch_loss_alpha
-        self.aligner_loss_alpha = c.aligner_loss_alpha
         self.binary_alignment_loss_alpha = c.binary_align_loss_alpha

     @staticmethod
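Since the criterion now keys off the config, a sketch of constructing it from the new FastSpeech config. The imports match those used elsewhere in this diff, and the asserted behavior follows directly from the guards above.

from TTS.tts.configs import FastSpeechConfig
from TTS.tts.layers.losses import ForwardTTSLoss

criterion = ForwardTTSLoss(FastSpeechConfig())
assert hasattr(criterion, "aligner_loss")    # use_aligner defaults to True
assert not hasattr(criterion, "pitch_loss")  # FastSpeech sets use_pitch=False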
@@ -731,7 +732,7 @@ class FastPitchLoss(nn.Module):
     ):
         loss = 0
         return_dict = {}
-        if self.ssim_loss_alpha > 0:
+        if hasattr(self, "ssim_loss") and self.ssim_loss_alpha > 0:
             ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
             loss = loss + self.ssim_loss_alpha * ssim_loss
             return_dict["loss_ssim"] = self.ssim_loss_alpha * ssim_loss

@@ -747,12 +748,12 @@ class FastPitchLoss(nn.Module):
             loss = loss + self.dur_loss_alpha * dur_loss
             return_dict["loss_dur"] = self.dur_loss_alpha * dur_loss

-        if self.pitch_loss_alpha > 0:
+        if hasattr(self, "pitch_loss") and self.pitch_loss_alpha > 0:
             pitch_loss = self.pitch_loss(pitch_output.transpose(1, 2), pitch_target.transpose(1, 2), input_lens)
             loss = loss + self.pitch_loss_alpha * pitch_loss
             return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss

-        if self.aligner_loss_alpha > 0:
+        if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
             aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
             loss = loss + self.aligner_loss_alpha * aligner_loss
             return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss
@@ -5,7 +5,7 @@ from torch import nn

 from TTS.tts.layers.glow_tts.glow import WN
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 LRELU_SLOPE = 0.1
@@ -4,6 +4,10 @@ from TTS.utils.generic_utils import find_module

 def setup_model(config):
     print(" > Using model: {}".format(config.model))
+    # fetch the right model implementation.
+    if "base_model" in config and config["base_model"] is not None:
+        MyModel = find_module("TTS.tts.models", config.base_model.lower())
+    else:
+        MyModel = find_module("TTS.tts.models", config.model.lower())
     # define set of characters used by the model
     if config.characters is not None:
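A sketch of the new dispatch path. This is hypothetical usage; a real call also needs the dataset and character settings filled in, but it shows which module wins the lookup.

from TTS.tts.configs import FastPitchConfig
from TTS.tts.models import setup_model

config = FastPitchConfig()  # model="fast_pitch", base_model="forward_tts"
model = setup_model(config) # resolved via base_model (forward_tts), not model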
@@ -10,9 +10,8 @@ from TTS.tts.layers.feed_forward.decoder import Decoder
 from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
 from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
-from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_fsspec

@@ -168,7 +167,12 @@ class AlignTTS(BaseTTS):
         return dr_mas.squeeze(1), log_p

     @staticmethod
-    def convert_dr_to_align(dr, x_mask, y_mask):
+    def generate_attn(dr, x_mask, y_mask=None):
+        # compute decode mask from the durations
+        if y_mask is None:
+            y_lengths = dr.sum(1).long()
+            y_lengths[y_lengths < 1] = 1
+            y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
         attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
         attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
         return attn

@@ -187,7 +191,7 @@ class AlignTTS(BaseTTS):
             [0, 1, 1, 1, 0, 0, 0],
             [1, 0, 0, 0, 0, 0, 0]]
         """
-        attn = self.convert_dr_to_align(dr, x_mask, y_mask)
+        attn = self.generate_attn(dr, x_mask, y_mask)
         o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
         return o_en_ex, attn

@@ -275,7 +279,7 @@ class AlignTTS(BaseTTS):
             o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
             dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
             y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
-            attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask)
+            attn = self.generate_attn(dr_mas, x_mask, y_mask)
         elif phase == 1:
             # train decoder
             o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
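Because `generate_attn` can now derive `y_mask` from the durations themselves, a tiny standalone check becomes possible. A sketch using the static method above:

import torch
from TTS.tts.models.align_tts import AlignTTS

dr = torch.tensor([[1.0, 3.0, 2.0, 1.0]])  # durations per input token
x_mask = torch.ones(1, 1, 4)
attn = AlignTTS.generate_attn(dr, x_mask)  # y_mask inferred: 1+3+2+1 = 7 frames
print(attn.shape)                          # torch.Size([1, 4, 7])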
@@ -9,7 +9,7 @@ from torch import nn

 from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
 from TTS.tts.utils.text import make_symbols
 from TTS.utils.generic_utils import format_aux_input
@@ -115,12 +115,19 @@ class BaseTacotron(BaseTTS):
     ):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         # TODO: set r in run-time by taking it from the new config
         if "r" in state:
+            # set r from the state (for compatibility with older checkpoints)
             self.decoder.set_r(state["r"])
-        else:
+        elif "config" in state:
+            # set r from config used at training time (for inference)
+            self.decoder.set_r(state["config"]["r"])
+        else:
+            # set r from the new config (for new-models)
             self.decoder.set_r(config.r)
         if eval:
             self.eval()
             print(f" > Model's reduction rate `r` is set to: {self.decoder.r}")
             assert not self.training

     def get_criterion(self) -> nn.Module:
@@ -11,16 +11,15 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.aligner import AlignmentNetwork
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
-from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
 from TTS.utils.audio import AudioProcessor


 @dataclass
-class FastPitchArgs(Coqpit):
-    """Fast Pitch Model arguments.
+class ForwardTTSArgs(Coqpit):
+    """ForwardTTS Model arguments.

     Args:
@@ -36,6 +35,14 @@ class FastPitchArgs(Coqpit):
         num_speakers (int):
             Number of speakers for the speaker embedding layer. Defaults to 0.

+        use_aligner (bool):
+            Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
+            If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
+            pre-computed durations must be provided to `config.datasets[0].meta_file_attn_mask`. Defaults to True.
+
+        use_pitch (bool):
+            Use pitch predictor to learn the pitch. Defaults to True.
+
         duration_predictor_hidden_channels (int):
             Number of hidden channels in the duration predictor. Defaults to 256.
@@ -93,21 +100,21 @@ class FastPitchArgs(Coqpit):
         max_duration (int):
             Maximum duration accepted by the model. Defaults to 75.
-
-        use_aligner (bool):
-            Use aligner network to learn the text to speech alignment. Defaults to True.
     """

     num_chars: int = None
     out_channels: int = 80
     hidden_channels: int = 384
     num_speakers: int = 0
-    duration_predictor_hidden_channels: int = 256
-    duration_predictor_kernel_size: int = 3
-    duration_predictor_dropout_p: float = 0.1
+    use_aligner: bool = True
+    use_pitch: bool = True
     pitch_predictor_hidden_channels: int = 256
     pitch_predictor_kernel_size: int = 3
     pitch_predictor_dropout_p: float = 0.1
     pitch_embedding_kernel_size: int = 3
+    duration_predictor_hidden_channels: int = 256
+    duration_predictor_kernel_size: int = 3
+    duration_predictor_dropout_p: float = 0.1
     positional_encoding: bool = True
     poisitonal_encoding_use_scale: bool = True
     length_scale: int = 1
@@ -123,32 +130,32 @@ class FastPitchArgs(Coqpit):
     d_vector_dim: int = 0
     detach_duration_predictor: bool = False
     max_duration: int = 75
-    use_aligner: bool = True


-class FastPitch(BaseTTS):
-    """FastPitch model. Very similart to SpeedySpeech model but with pitch prediction.
+class ForwardTTS(BaseTTS):
+    """General forward TTS model implementation that uses an encoder-decoder architecture with an optional alignment
+    network and a pitch predictor.

-    Paper::
-        https://arxiv.org/abs/2006.06873
+    If the alignment network is used, the model learns the text-to-speech alignment
+    from the data instead of using pre-computed durations.

-    Paper abstract::
-        We present FastPitch, a fully-parallel text-to-speech model based on FastSpeech, conditioned on fundamental
-        frequency contours. The model predicts pitch contours during inference. By altering these predictions,
-        the generated speech can be more expressive, better match the semantic of the utterance, and in the end
-        more engaging to the listener. Uniformly increasing or decreasing pitch with FastPitch generates speech
-        that resembles the voluntary modulation of voice. Conditioning on frequency contours improves the overall
-        quality of synthesized speech, making it comparable to state-of-the-art. It does not introduce an overhead,
-        and FastPitch retains the favorable, fully-parallel Transformer architecture, with over 900x real-time
-        factor for mel-spectrogram synthesis of a typical utterance."
+    If the pitch predictor is used, the model trains a pitch predictor that predicts average pitch value for each
+    input character as in the FastPitch model.
+
+    `ForwardTTS` can be configured to one of these architectures,
+
+        - FastPitch
+        - SpeedySpeech
+        - FastSpeech
+        - TODO: FastSpeech2 (requires average speech energy predictor)

     Args:
         config (Coqpit): Model coqpit class.

     Examples:
-        >>> from TTS.tts.models.fast_pitch import FastPitch, FastPitchArgs
-        >>> config = FastPitchArgs()
-        >>> model = FastPitch(config)
+        >>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
+        >>> config = ForwardTTSArgs()
+        >>> model = ForwardTTS(config)
     """

     # pylint: disable=dangerous-default-value
@@ -157,24 +164,25 @@ class FastPitch(BaseTTS):
         super().__init__()

         # don't use isintance not to import recursively
-        if config.__class__.__name__ == "FastPitchConfig":
+        if "Config" in config.__class__.__name__:
             if "characters" in config:
                 # loading from FasrPitchConfig
                 _, self.config, num_chars = self.get_characters(config)
                 config.model_args.num_chars = num_chars
                 self.args = self.config.model_args
             else:
-                # loading from FastPitchArgs
+                # loading from ForwardTTSArgs
                 self.config = config
                 self.args = config.model_args
-        elif isinstance(config, FastPitchArgs):
+        elif isinstance(config, ForwardTTSArgs):
             self.args = config
             self.config = config
         else:
-            raise ValueError("config must be either a VitsConfig or Vitsself.args")
+            raise ValueError("config must be either a *Config or ForwardTTSArgs")

         self.max_duration = self.args.max_duration
         self.use_aligner = self.args.use_aligner
+        self.use_pitch = self.args.use_pitch
         self.use_binary_alignment_loss = False

         self.length_scale = (
@@ -208,13 +216,13 @@ class FastPitch(BaseTTS):
             self.args.duration_predictor_dropout_p,
         )

+        if self.args.use_pitch:
             self.pitch_predictor = DurationPredictor(
                 self.args.hidden_channels + self.args.d_vector_dim,
                 self.args.pitch_predictor_hidden_channels,
                 self.args.pitch_predictor_kernel_size,
                 self.args.pitch_predictor_dropout_p,
             )
             self.pitch_emb = nn.Conv1d(
                 1,
                 self.args.hidden_channels,
@@ -257,18 +265,22 @@ class FastPitch(BaseTTS):
         """Generate attention alignment map from durations and
         expand encoder outputs

-        Shapes
+        Shapes:
             - en: :math:`(B, D_{en}, T_{en})`
             - dr: :math:`(B, T_{en})`
             - x_mask: :math:`(B, T_{en})`
             - y_mask: :math:`(B, T_{de})`

-        Examples:
-            - encoder output: :math:`[a,b,c,d]`
-            - durations: :math:`[1, 3, 2, 1]`
-            - expanded: :math:`[a, b, b, b, c, c, d]`
-            - attention map: :math:`[[0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0]]`
+        Examples::
+
+            encoder output: [a,b,c,d]
+            durations: [1, 3, 2, 1]
+
+            expanded: [a, b, b, b, c, c, d]
+            attention map: [[0, 0, 0, 0, 0, 0, 1],
+                            [0, 0, 0, 0, 1, 1, 0],
+                            [0, 1, 1, 1, 0, 0, 0],
+                            [1, 0, 0, 0, 0, 0, 0]]
         """
         attn = self.generate_attn(dr, x_mask, y_mask)
         o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2)
@@ -416,7 +428,7 @@ class FastPitch(BaseTTS):
         """
         o_pitch = self.pitch_predictor(o_en, x_mask)
         if pitch is not None:
-            avg_pitch = average_pitch(pitch, dr)
+            avg_pitch = average_over_durations(pitch, dr)
             o_pitch_emb = self.pitch_emb(avg_pitch)
             return o_pitch_emb, o_pitch, avg_pitch
         o_pitch_emb = self.pitch_emb(o_pitch)
@@ -471,7 +483,7 @@ class FastPitch(BaseTTS):
         y: torch.FloatTensor = None,
         dr: torch.IntTensor = None,
         pitch: torch.FloatTensor = None,
-        aux_input: Dict = {"d_vectors": 0, "speaker_ids": None},  # pylint: disable=unused-argument
+        aux_input: Dict = {"d_vectors": None, "speaker_ids": None},  # pylint: disable=unused-argument
     ) -> Dict:
         """Model's forward pass.
@@ -479,10 +491,10 @@ class FastPitch(BaseTTS):
             x (torch.LongTensor): Input character sequences.
             x_lengths (torch.LongTensor): Input sequence lengths.
             y_lengths (torch.LongTensor): Output sequnce lengths. Defaults to None.
-            y (torch.FloatTensor): Spectrogram frames. Defaults to None.
-            dr (torch.IntTensor): Character durations over the spectrogram frames. Defaults to None.
-            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Defaults to None.
-            aux_input (Dict): Auxiliary model inputs. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.
+            y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
+            dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None.
+            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
+            aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.

         Shapes:
             - x: :math:`[B, T_max]`
@@ -495,8 +507,8 @@ class FastPitch(BaseTTS):
         """
         g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
         # compute sequence masks
-        y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(y.dtype)
-        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(y.dtype)
+        y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
         # encoder pass
         o_en, o_en_dp, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
         # duration predictor pass
@@ -507,27 +519,36 @@ class FastPitch(BaseTTS):
         o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
         # generate attn mask from predicted durations
         o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
-        # aligner pass
+        # aligner
         o_alignment_dur = None
         alignment_soft = None
         alignment_logprob = None
+        alignment_mas = None
         if self.use_aligner:
             o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas = self._forward_aligner(
                 x_emb, y, x_mask, y_mask
             )
+            alignment_soft = alignment_soft.transpose(1, 2)
+            alignment_mas = alignment_mas.transpose(1, 2)
             dr = o_alignment_dur
         # pitch predictor pass
         o_pitch = None
         avg_pitch = None
         if self.args.use_pitch:
             o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en_dp, x_mask, pitch, dr)
             o_en = o_en + o_pitch_emb
         # decoder pass
         o_de, attn = self._forward_decoder(o_en, dr, x_mask, y_lengths, g=g)
         outputs = {
-            "model_outputs": o_de,
-            "durations_log": o_dr_log.squeeze(1),
-            "durations": o_dr.squeeze(1),
-            "attn_durations": o_attn,  # for visualization
+            "model_outputs": o_de,  # [B, T, C]
+            "durations_log": o_dr_log.squeeze(1),  # [B, T]
+            "durations": o_dr.squeeze(1),  # [B, T]
+            "attn_durations": o_attn,  # for visualization [B, T_en, T_de']
             "pitch_avg": o_pitch,
             "pitch_avg_gt": avg_pitch,
-            "alignments": attn,
-            "alignment_soft": alignment_soft.transpose(1, 2),
-            "alignment_mas": alignment_mas.transpose(1, 2),
+            "alignments": attn,  # [B, T_de, T_en]
+            "alignment_soft": alignment_soft,
+            "alignment_mas": alignment_mas,
             "o_alignment_dur": o_alignment_dur,
             "alignment_logprob": alignment_logprob,
             "x_mask": x_mask,
@@ -558,6 +579,8 @@ class FastPitch(BaseTTS):
         o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
         y_lengths = o_dr.sum(1)
+        # pitch predictor pass
+        o_pitch = None
         if self.args.use_pitch:
             o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en_dp, x_mask)
             o_en = o_en + o_pitch_emb
         # decoder pass
@@ -575,7 +598,7 @@ class FastPitch(BaseTTS):
         text_lengths = batch["text_lengths"]
         mel_input = batch["mel_input"]
         mel_lengths = batch["mel_lengths"]
-        pitch = batch["pitch"]
+        pitch = batch["pitch"] if self.args.use_pitch else None
         d_vectors = batch["d_vectors"]
         speaker_ids = batch["speaker_ids"]
         durations = batch["durations"]
@@ -597,10 +620,10 @@ class FastPitch(BaseTTS):
             decoder_output_lens=mel_lengths,
             dur_output=outputs["durations_log"],
             dur_target=durations,
-            pitch_output=outputs["pitch_avg"],
-            pitch_target=outputs["pitch_avg_gt"],
+            pitch_output=outputs["pitch_avg"] if self.use_pitch else None,
+            pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None,
             input_lens=text_lengths,
-            alignment_logprob=outputs["alignment_logprob"],
+            alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None,
+            alignment_soft=outputs["alignment_soft"] if self.use_binary_alignment_loss else None,
+            alignment_hard=outputs["alignment_mas"] if self.use_binary_alignment_loss else None,
         )
@ -615,27 +638,32 @@ class FastPitch(BaseTTS):
        model_outputs = outputs["model_outputs"]
        alignments = outputs["alignments"]
        mel_input = batch["mel_input"]
        pitch = batch["pitch"]
        pitch_avg_expanded, _ = self.expand_encoder_outputs(
            outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"]
        )

        pred_spec = model_outputs[0].data.cpu().numpy()
        gt_spec = mel_input[0].data.cpu().numpy()
        align_img = alignments[0].data.cpu().numpy()
        pitch = pitch[0, 0].data.cpu().numpy()

        # TODO: denormalize before plotting
        pitch = abs(pitch)
        pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy()

        figures = {
            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
            "alignment": plot_alignment(align_img, output_fig=False),
        }

        # plot pitch figures
        if self.args.use_pitch:
            pitch = batch["pitch"]
            pitch_avg_expanded, _ = self.expand_encoder_outputs(
                outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"]
            )
            pitch = pitch[0, 0].data.cpu().numpy()
            # TODO: denormalize before plotting
            pitch = abs(pitch)
            pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy()
            pitch_figures = {
                "pitch_ground_truth": plot_pitch(pitch, gt_spec, ap, output_fig=False),
                "pitch_avg_predicted": plot_pitch(pitch_avg_expanded, pred_spec, ap, output_fig=False),
            }
            figures.update(pitch_figures)

        # plot the attention mask computed from the predicted durations
        if "attn_durations" in outputs:
@ -662,36 +690,11 @@ class FastPitch(BaseTTS):
            assert not self.training

    def get_criterion(self):
        from TTS.tts.layers.losses import FastPitchLoss  # pylint: disable=import-outside-toplevel
        from TTS.tts.layers.losses import ForwardTTSLoss  # pylint: disable=import-outside-toplevel

        return FastPitchLoss(self.config)
        return ForwardTTSLoss(self.config)

    def on_train_step_start(self, trainer):
        """Enable binary alignment loss when needed"""
        if trainer.total_steps_done > self.config.binary_align_loss_start_step:
            self.use_binary_alignment_loss = True


def average_pitch(pitch, durs):
    """Compute the average pitch value for each input character based on the durations.

    Shapes:
        - pitch: :math:`[B, 1, T_de]`
        - durs: :math:`[B, T_en]`
    """
    durs_cums_ends = torch.cumsum(durs, dim=1).long()
    durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
    pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0))
    pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0))

    bs, l = durs_cums_ends.size()
    n_formants = pitch.size(1)
    dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
    dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)

    pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float()
    pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float()

    pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems)
    return pitch_avg
@ -7,9 +7,8 @@ from torch.nn import functional as F
from TTS.tts.configs import GlowTTSConfig
from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
@ -133,7 +132,7 @@ class GlowTTS(BaseTTS):
        return y_mean, y_log_scale, o_attn_dur

    def forward(
        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        Shapes:
@ -185,7 +184,7 @@ class GlowTTS(BaseTTS):
    @torch.no_grad()
    def inference_with_MAS(
        self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
        self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        It's similar to the teacher forcing in Tacotron.
@ -246,7 +245,7 @@ class GlowTTS(BaseTTS):
    @torch.no_grad()
    def decoder_inference(
        self, y, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
        self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        Shapes:
@ -278,7 +277,9 @@ class GlowTTS(BaseTTS):
        return outputs

    @torch.no_grad()
    def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids":None}):  # pylint: disable=dangerous-default-value
    def inference(
        self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        x_lengths = aux_input["x_lengths"]
        g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
@ -331,7 +332,13 @@ class GlowTTS(BaseTTS):
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]

        outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids":speaker_ids})
        outputs = self.forward(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
        )

        loss_dict = criterion(
            outputs["model_outputs"],
@ -1,320 +0,0 @@
from dataclasses import dataclass, field

import torch
from coqpit import Coqpit
from torch import nn

from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec


@dataclass
class SpeedySpeechArgs(Coqpit):
    """
    Args:
        num_chars (int): number of unique input characters
        out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size.
        hidden_channels (int): number of channels in all the model layers.
        positional_encoding (bool, optional): enable/disable Positional encoding on encoder outputs. Defaults to True.
        length_scale (int, optional): coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.
        encoder_type (str, optional): set the encoder type. Defaults to 'residual_conv_bn'.
        encoder_params (dict, optional): set encoder parameters depending on 'encoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13 }.
        decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'.
        decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }.
        num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0.
        use_d_vector (bool, optional): enable external speaker embeddings. Defaults to False.
        d_vector_dim (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
    """

    num_chars: int = None
    out_channels: int = 80
    hidden_channels: int = 128
    num_speakers: int = 0
    positional_encoding: bool = True
    length_scale: int = 1
    encoder_type: str = "residual_conv_bn"
    encoder_params: dict = field(
        default_factory=lambda: {
            "kernel_size": 4,
            "dilations": 4 * [1, 2, 4] + [1],
            "num_conv_blocks": 2,
            "num_res_blocks": 13,
        }
    )
    decoder_type: str = "residual_conv_bn"
    decoder_params: dict = field(
        default_factory=lambda: {
            "kernel_size": 4,
            "dilations": 4 * [1, 2, 4, 8] + [1],
            "num_conv_blocks": 2,
            "num_res_blocks": 17,
        }
    )
    use_d_vector: bool = False
    d_vector_dim: int = 0


class SpeedySpeech(BaseTTS):
    """Speedy Speech model
    https://arxiv.org/abs/2008.03802

    Encoder -> DurationPredictor -> Decoder

    Paper abstract:
        While recent neural sequence-to-sequence models have greatly improved the quality of speech
        synthesis, there has not been a system capable of fast training, fast inference and high-quality audio synthesis
        at the same time. We propose a student-teacher network capable of high-quality faster-than-real-time spectrogram
        synthesis, with low requirements on computational resources and fast training time. We show that self-attention
        layers are not necessary for generation of high quality audio. We utilize simple convolutional blocks with
        residual connections in both student and teacher networks and use only a single attention layer in the teacher
        model. Coupled with a MelGAN vocoder, our model's voice quality was rated significantly higher than Tacotron 2.
        Our model can be efficiently trained on a single GPU and can run in real time even on a CPU. We provide both
        our source code and audio samples in our GitHub repository.

    Notes:
        The vanilla model is able to achieve a reasonable performance with only
        ~3M model parameters and convolutional layers.

        This model requires precomputed phoneme durations to train a duration predictor. At inference,
        it only uses the duration predictor to compute durations and expand the encoder outputs accordingly.

        You can also mix and match different encoder and decoder networks beyond the paper.

        Check `SpeedySpeechArgs` for arguments.
    """

    # pylint: disable=dangerous-default-value

    def __init__(self, config: Coqpit):
        super().__init__()
        self.config = config

        if "characters" in config:
            _, self.config, self.num_chars = self.get_characters(config)

        self.length_scale = (
            float(config.model_args.length_scale)
            if isinstance(config.model_args.length_scale, int)
            else config.model_args.length_scale
        )
        self.emb = nn.Embedding(self.num_chars, config.model_args.hidden_channels)
        self.encoder = Encoder(
            config.model_args.hidden_channels,
            config.model_args.hidden_channels,
            config.model_args.encoder_type,
            config.model_args.encoder_params,
            config.model_args.d_vector_dim,
        )
        if config.model_args.positional_encoding:
            self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels)
        self.decoder = Decoder(
            config.model_args.out_channels,
            config.model_args.hidden_channels,
            config.model_args.decoder_type,
            config.model_args.decoder_params,
        )
        self.duration_predictor = DurationPredictor(config.model_args.hidden_channels + config.model_args.d_vector_dim)

        if config.model_args.num_speakers > 1 and not config.model_args.use_d_vector:
            # speaker embedding layer
            self.emb_g = nn.Embedding(config.model_args.num_speakers, config.model_args.d_vector_dim)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if config.model_args.d_vector_dim > 0 and config.model_args.d_vector_dim != config.model_args.hidden_channels:
            self.proj_g = nn.Conv1d(config.model_args.d_vector_dim, config.model_args.hidden_channels, 1)

    @staticmethod
    def expand_encoder_outputs(en, dr, x_mask, y_mask):
        """Generate attention alignment map from durations and
        expand encoder outputs

        Example:
            encoder output: [a,b,c,d]
            durations: [1, 3, 2, 1]

            expanded: [a, b, b, b, c, c, d]
            attention map: [[0, 0, 0, 0, 0, 0, 1],
                            [0, 0, 0, 0, 1, 1, 0],
                            [0, 1, 1, 1, 0, 0, 0],
                            [1, 0, 0, 0, 0, 0, 0]]
        """
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
        o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
        return o_en_ex, attn

    def format_durations(self, o_dr_log, x_mask):
        o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
        o_dr[o_dr < 1] = 1.0
        o_dr = torch.round(o_dr)
        return o_dr

    @staticmethod
    def _concat_speaker_embedding(o_en, g):
        g_exp = g.expand(-1, -1, o_en.size(-1))  # [B, C, T_en]
        o_en = torch.cat([o_en, g_exp], 1)
        return o_en

    def _sum_speaker_embedding(self, x, g):
        # project g to decoder dim.
        if hasattr(self, "proj_g"):
            g = self.proj_g(g)
        return x + g

    def _forward_encoder(self, x, x_lengths, g=None):
        if hasattr(self, "emb_g"):
            g = nn.functional.normalize(self.emb_g(g))  # [B, C, 1]

        if g is not None:
            g = g.unsqueeze(-1)

        # [B, T, C]
        x_emb = self.emb(x)
        # [B, C, T]
        x_emb = torch.transpose(x_emb, 1, -1)

        # compute sequence masks
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype)

        # encoder pass
        o_en = self.encoder(x_emb, x_mask)

        # speaker conditioning for duration predictor
        if g is not None:
            o_en_dp = self._concat_speaker_embedding(o_en, g)
        else:
            o_en_dp = o_en
        return o_en, o_en_dp, x_mask, g

    def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
        # expand o_en with durations
        o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
        # positional encoding
        if hasattr(self, "pos_encoder"):
            o_en_ex = self.pos_encoder(o_en_ex, y_mask)
        # speaker embedding
        if g is not None:
            o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
        # decoder pass
        o_de = self.decoder(o_en_ex, y_mask, g=g)
        return o_de, attn.transpose(1, 2)

    def forward(
        self, x, x_lengths, y_lengths, dr, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=unused-argument
        """
        TODO: speaker embedding for speaker_ids
        Shapes:
            x: [B, T_max]
            x_lengths: [B]
            y_lengths: [B]
            dr: [B, T_max]
            g: [B, C]
        """
        g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
        o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
        o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
        o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
        outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn}
        return outputs

    @torch.no_grad()
    def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}):  # pylint: disable=unused-argument
        """
        Shapes:
            x: [B, T_max]
            x_lengths: [B]
            g: [B, C]
        """
        g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
        x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
        # input sequence should be greater than the max convolution size
        inference_padding = 5
        if x.shape[1] < 13:
            inference_padding += 13 - x.shape[1]
        # pad input to prevent dropping the last word
        x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0)
        o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
        # duration predictor pass
        o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
        o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
        y_lengths = o_dr.sum(1)
        o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
        outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn, "durations_log": None}
        return outputs

    def train_step(self, batch: dict, criterion: nn.Module):
        text_input = batch["text_input"]
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        mel_lengths = batch["mel_lengths"]
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]
        durations = batch["durations"]

        aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
        outputs = self.forward(text_input, text_lengths, mel_lengths, durations, aux_input)

        # compute loss
        loss_dict = criterion(
            outputs["model_outputs"],
            mel_input,
            mel_lengths,
            outputs["durations_log"],
            torch.log(1 + durations),
            text_lengths,
        )

        # compute alignment error (the lower the better)
        align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True)
        loss_dict["align_error"] = align_error
        return outputs, loss_dict

    def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict):  # pylint: disable=no-self-use
        model_outputs = outputs["model_outputs"]
        alignments = outputs["alignments"]
        mel_input = batch["mel_input"]

        pred_spec = model_outputs[0].data.cpu().numpy()
        gt_spec = mel_input[0].data.cpu().numpy()
        align_img = alignments[0].data.cpu().numpy()

        figures = {
            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
            "alignment": plot_alignment(align_img, output_fig=False),
        }

        # Sample audio
        train_audio = ap.inv_melspectrogram(pred_spec.T)
        return figures, {"audio": train_audio}

    def eval_step(self, batch: dict, criterion: nn.Module):
        return self.train_step(batch, criterion)

    def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
        return self.train_log(ap, batch, outputs)

    def load_checkpoint(
        self, config, checkpoint_path, eval=False
    ):  # pylint: disable=unused-argument, redefined-builtin
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
        self.load_state_dict(state["model"])
        if eval:
            self.eval()
            assert not self.training

    def get_criterion(self):
        from TTS.tts.layers.losses import SpeedySpeechLoss  # pylint: disable=import-outside-toplevel

        return SpeedySpeechLoss(self.config)
@ -9,12 +9,11 @@ from torch import nn
from torch.cuda.amp.autocast_mode import autocast

from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.layers.vits.discriminator import VitsDiscriminator
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
@ -24,28 +23,6 @@ from TTS.vocoder.models.hifigan_generator import HifiganGenerator
from TTS.vocoder.utils.generic_utils import plot_results


def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
    """Segment each sample in a batch based on the provided segment indices"""
    segments = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        index_start = segment_indices[i]
        index_end = index_start + segment_size
        segments[i] = x[i, :, index_start:index_end]
    return segments


def rand_segment(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
    """Create random segments based on the input lengths."""
    B, _, T = x.size()
    if x_lengths is None:
        x_lengths = T
    max_idxs = x_lengths - segment_size + 1
    assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
    segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
    ret = segment(x, segment_indices, segment_size)
    return ret, segment_indices


@dataclass
class VitsArgs(Coqpit):
    """VITS model arguments.
@ -451,7 +428,7 @@ class Vits(BaseTTS):
        logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])

        # select a random feature segment for the waveform decoder
        z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size)
        z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size)
        o = self.waveform_decoder(z_slice, g=g)
        outputs.update(
            {
@ -1,5 +1,4 @@
import numpy as np
import torch


def _pad_data(x, length):
@ -52,35 +51,3 @@ def prepare_stop_target(inputs, out_steps):
def pad_per_step(inputs, pad_len):
    return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)


# pylint: disable=attribute-defined-outside-init
class StandardScaler:
    def set_stats(self, mean, scale):
        self.mean_ = mean
        self.scale_ = scale

    def reset_stats(self):
        delattr(self, "mean_")
        delattr(self, "scale_")

    def transform(self, X):
        X = np.asarray(X)
        X -= self.mean_
        X /= self.scale_
        return X

    def inverse_transform(self, X):
        X = np.asarray(X)
        X *= self.scale_
        X += self.mean_
        return X


# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
    # B x T_max
    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
@ -0,0 +1,213 @@
import numpy as np
import torch
from torch.nn import functional as F

try:
    from TTS.tts.utils.monotonic_align.core import maximum_path_c

    CYTHON = True
except ModuleNotFoundError:
    CYTHON = False


class StandardScaler:
    """StandardScaler for mean-scale normalization with the given mean and scale values."""

    def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
        self.mean_ = mean
        self.scale_ = scale

    def set_stats(self, mean, scale):
        self.mean_ = mean
        self.scale_ = scale

    def reset_stats(self):
        delattr(self, "mean_")
        delattr(self, "scale_")

    def transform(self, X):
        X = np.asarray(X)
        X -= self.mean_
        X /= self.scale_
        return X

    def inverse_transform(self, X):
        X = np.asarray(X)
        X *= self.scale_
        X += self.mean_
        return X
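# Illustrative usage (not part of the original file): with mean=0.5 and scale=2.0,
# transform maps 1.5 -> 0.5 and inverse_transform maps it back:
#   scaler = StandardScaler(mean=np.array([0.5]), scale=np.array([2.0]))
#   scaler.transform([1.5])          # -> array([0.5])
#   scaler.inverse_transform([0.5])  # -> array([1.5])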
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
    """Create a sequence mask for filtering padding in a sequence tensor.

    Args:
        sequence_length (torch.tensor): Sequence lengths.
        max_len (int, Optional): Maximum sequence length. Defaults to None.

    Shapes:
        - mask: :math:`[B, T_max]`
    """
    if max_len is None:
        max_len = sequence_length.data.max()
    seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
    # B x T_max
    mask = seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
    return mask
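# Illustrative usage (not part of the original file):
#   sequence_mask(torch.tensor([2, 3]), max_len=4)
#   -> tensor([[ True,  True, False, False],
#              [ True,  True,  True, False]])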
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
    """Segment each sample in a batch based on the provided segment indices.

    Args:
        x (torch.tensor): Input tensor.
        segment_indices (torch.tensor): Segment indices.
        segment_size (int): Expected output segment size.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        index_start = segment_indices[i]
        index_end = index_start + segment_size
        segments[i] = x[i, :, index_start:index_end]
    return segments
def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
    """Create random segments based on the input lengths.

    Args:
        x (torch.tensor): Input tensor.
        x_lengths (torch.tensor): Input lengths.
        segment_size (int): Expected output segment size.

    Shapes:
        - x: :math:`[B, C, T]`
        - x_lengths: :math:`[B]`
    """
    B, _, T = x.size()
    if x_lengths is None:
        x_lengths = T
    max_idxs = x_lengths - segment_size + 1
    assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
    segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
    ret = segment(x, segment_indices, segment_size)
    return ret, segment_indices
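# Illustrative usage (not part of the original file): pick a random 4-frame
# window from each padded sample in a batch of spectrogram features.
#   x = torch.randn(2, 80, 100)       # [B, C, T]
#   x_lens = torch.tensor([100, 60])  # valid lengths per sample
#   segs, idxs = rand_segments(x, x_lens, segment_size=4)  # segs: [2, 80, 4]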
def average_over_durations(values, durs):
    """Average values over durations.

    Shapes:
        - values: :math:`[B, 1, T_de]`
        - durs: :math:`[B, T_en]`
        - avg: :math:`[B, 1, T_en]`
    """
    durs_cums_ends = torch.cumsum(durs, dim=1).long()
    durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
    values_nonzero_cums = torch.nn.functional.pad(torch.cumsum(values != 0.0, dim=2), (1, 0))
    values_cums = torch.nn.functional.pad(torch.cumsum(values, dim=2), (1, 0))

    bs, l = durs_cums_ends.size()
    n_formants = values.size(1)
    dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
    dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)

    values_sums = (torch.gather(values_cums, 2, dce) - torch.gather(values_cums, 2, dcs)).float()
    values_nelems = (torch.gather(values_nonzero_cums, 2, dce) - torch.gather(values_nonzero_cums, 2, dcs)).float()

    avg = torch.where(values_nelems == 0.0, values_nelems, values_sums / values_nelems)
    return avg
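# Illustrative example (not part of the original file): with
#   values = torch.tensor([[[1., 3., 0., 2., 4.]]]) and durs = torch.tensor([[2, 3]]),
# the first token covers frames [1, 3] and the second covers [0, 2, 4]; zeros
# (e.g. unvoiced pitch frames) are excluded from the mean, so the result is
#   tensor([[[2., 3.]]])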
def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def generate_path(duration, mask):
    """
    Shapes:
        - duration: :math:`[B, T_en]`
        - mask: :math:`[B, T_en, T_de]`
        - path: :math:`[B, T_en, T_de]`
    """
    device = duration.device
    b, t_x, t_y = mask.shape
    cum_duration = torch.cumsum(duration, 1)
    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path * mask
    return path
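# Illustrative example (not part of the original file): with
#   duration = torch.tensor([[2, 1]]) and a full mask of shape [1, 2, 3],
# generate_path returns (in the mask's dtype)
#   [[[1, 1, 0],
#     [0, 0, 1]]]
# i.e. the first token is expanded over frames 0-1 and the second over frame 2.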
def maximum_path(value, mask):
    if CYTHON:
        return maximum_path_cython(value, mask)
    return maximum_path_numpy(value, mask)


def maximum_path_cython(value, mask):
    """Cython optimised version.

    Shapes:
        - value: :math:`[B, T_en, T_de]`
        - mask: :math:`[B, T_en, T_de]`
    """
    value = value * mask
    device = value.device
    dtype = value.dtype
    value = value.data.cpu().numpy().astype(np.float32)
    path = np.zeros_like(value).astype(np.int32)
    mask = mask.data.cpu().numpy()

    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
def maximum_path_numpy(value, mask, max_neg_val=None):
    """
    Monotonic alignment search algorithm.
    Numpy-friendly version. It's about 4 times faster than the torch version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
    """
    if max_neg_val is None:
        max_neg_val = -np.inf  # Patch for Sphinx complaint
    value = value * mask

    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy().astype(np.bool)

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
    for j in range(t_y):
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = v1 >= v0
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        index_mask = x_range <= j
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    direction = np.where(mask, direction, 1)

    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)
    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1
    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path
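# Illustrative example (not part of the original file): with a full mask and
#   value = torch.tensor([[[0.9, 0.1, 0.1],
#                          [0.1, 0.9, 0.9]]])
# maximum_path returns the best monotonic alignment
#   tensor([[[1., 0., 0.],
#            [0., 1., 1.]]])
# aligning token 0 to frame 0 and token 1 to frames 1-2.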
File diff suppressed because it is too large
@ -101,6 +101,7 @@ def visualize(
    figsize=(8, 24),
    output_fig=False,
):
    """Intended to be used in Notebooks."""

    if decoder_output is not None:
        num_plot = 4
@ -9,7 +9,7 @@ import soundfile as sf
import torch
from torch import nn

from TTS.tts.utils.data import StandardScaler
from TTS.tts.utils.helpers import StandardScaler


class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
@ -608,6 +608,9 @@ class AudioProcessor(object):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
        y = self._istft(S_complex * angles)
        if not np.isfinite(y).all():
            print(" [!] Waveform is not finite everywhere. Skipping the GL.")
            return np.array([0.0])
        for _ in range(self.griffin_lim_iters):
            angles = np.exp(1j * np.angle(self._stft(y)))
            y = self._istft(S_complex * angles)
@ -59,7 +59,7 @@ def load_wav_feat_data(data_path, feat_path, eval_split_size):
    wav_paths.sort(key=lambda x: Path(x).stem)
    feat_paths.sort(key=lambda x: Path(x).stem)

    assert len(wav_paths) == len(feat_paths)
    assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {feat_paths}"
    for wav, feat in zip(wav_paths, feat_paths):
        wav_name = Path(wav).stem
        feat_name = Path(feat).stem
@ -7,7 +7,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny.

## What are the requirements of a good 🐸TTS dataset?
* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset
* {ref}`See this page <what_makes_a_good_dataset>`

## How should I choose the right model?
- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2.
@ -0,0 +1,115 @@
# Fine-tuning a 🐸 TTS model

## Fine-tuning

Fine-tuning takes a pre-trained model and retrains it to improve the model performance on a different task or dataset.
In 🐸TTS we provide different pre-trained models in different languages, each with its own pros and cons. You can take one of
them and fine-tune it for your own dataset. This will help you in two main ways:

1. Faster learning

    Since a pre-trained model has already learned features that are relevant for the task, it will converge faster on
    a new dataset. This will reduce the cost of training and let you experiment faster.

2. Better results with small datasets

    Deep learning models are data hungry and they give better performance with more data. However, it is not always
    possible to have this abundance, especially in specific domains. For instance, the LJSpeech dataset, which we released most of
    our English models with, is almost 24 hours long, and collecting that amount of data with the help of a voice talent
    takes weeks.

    Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models, fine-tune it on your own
    speech dataset, and achieve reasonable results with only a couple of hours of data in the worst case.

    However, note that fine-tuning does not promise great results. The model performance still depends on the
    {ref}`dataset quality <what_makes_a_good_dataset>` and the hyper-parameters you choose for fine-tuning. Therefore,
    it still demands a bit of tinkering.


## Steps to fine-tune a 🐸 TTS model

1. Setup your dataset.

    You need to format your target dataset in a certain way so that the 🐸TTS data loader is able to load it for
    training. Please see {ref}`this page <formatting_your_dataset>` for more information about formatting.

2. Choose the model you want to fine-tune.

    You can list the available models on the terminal as

    ```bash
    tts --list-models
    ```

    The command above lists the models in the naming format ```<model_type>/<language>/<dataset>/<model_name>```.

    Or you can manually check the `.models.json` file in the project directory.

    You should choose the model based on your requirements. Some models are fast and some are better in speech quality.
    One lazy way to check a model is to run it on the hardware you want to use and see how it works. For
    simple testing, you can use the `tts` command on the terminal. For more info see {ref}`here <synthesizing_speech>`.

3. Download the model.

    You can download the model with the `tts` command. If you run `tts` with a particular model, it will be downloaded
    automatically and the model path will be printed on the terminal.

    ```bash
    tts --model_name tts_models/es/mai/tacotron2-DDC --text "Ola."

    > Downloading model to /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
    ...
    ```

    In the example above, we call the Spanish Tacotron model, and the sample output shows the path where
    the model is downloaded.

4. Setup the model config for fine-tuning.

    You need to change certain fields in the model config. You have 3 options for editing the configuration.

    1. Edit the fields in the ```config.json``` file if you want to use ```TTS/bin/train_tts.py``` to train the model.
    2. Edit the fields in one of the training scripts in the ```recipes``` directory if you want to use Python.
    3. Use the command-line arguments to override the fields like ```--coqpit.lr 0.00001``` to change the learning rate.

    Some of the important fields are as follows (see the sketch after this list):

    - `datasets` field: This is set to the dataset you want to fine-tune the model on.
    - `run_name` field: This is the name of the run. It is used to name the output directory and the entry in the
      logging dashboard.
    - `output_path` field: This is the path where the fine-tuned model is saved.
    - `lr` field: You may need to use a smaller learning rate for fine-tuning so that big update steps do not impair
      the features learned by the pre-trained model.
    - `audio` fields: Different datasets have different audio characteristics. You must check the current audio parameters and
      make sure that the values reflect your dataset. For instance, your dataset might have a different audio sampling rate.

    Apart from the fields above, you should check the whole configuration file and make sure that the values are correct for
    your dataset and training.
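    For instance, a minimal sketch of editing a downloaded config in Python before training (the paths are
    placeholders, and `save_json` is assumed from the underlying Coqpit config object):

    ```python
    from TTS.config import load_config

    # load the config shipped with the downloaded model (placeholder path)
    config = load_config("/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json")
    config.run_name = "glow-tts-finetune"
    config.lr = 0.00001
    config.output_path = "./finetune_output/"
    config.save_json("./finetune_output/config.json")  # point --config_path at this file
    ```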
5. Start fine-tuning.

    Whether you use one of the training scripts under the ```recipes``` folder or ```train_tts.py``` to start
    your training, you should use the ```--restore_path``` flag to specify the path to the pre-trained model.

    ```bash
    CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
        --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
    ```

    ```bash
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
        --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
        --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
    ```

    As stated above, you can also use command-line arguments to change the model configuration.

    ```bash
    CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
        --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts \
        --coqpit.run_name "glow-tts-finetune" \
        --coqpit.lr 0.00001
    ```
@ -1,3 +1,4 @@
(formatting_your_dataset)=
# Formatting Your Dataset

For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips, and each clip needs a transcription.
@ -18,15 +19,15 @@ Let's assume you created the audio clips and their transcription. You can collec
You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimited by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text.

We recommend the following format delimited by `|`.
We recommend the following format delimited by `||`.

```
# metadata.txt

audio1.wav | This is my sentence.
audio2.wav | This is maybe my sentence.
audio3.wav | This is certainly my sentence.
audio4.wav | Let this be your sentence.
audio1.wav || This is my sentence.
audio2.wav || This is maybe my sentence.
audio3.wav || This is certainly my sentence.
audio4.wav || Let this be your sentence.
...
```
@ -22,6 +22,7 @@
    inference
    implementing_a_new_model
    training_a_model
    finetuning
    configuration
    formatting_your_dataset
    what_makes_a_good_dataset
@ -45,7 +46,7 @@
    models/glow_tts.md
    models/vits.md
    models/fast_pitch.md
    models/forward_tts.md

.. toctree::
    :maxdepth: 2
@ -0,0 +1,65 @@
# Forward TTS model(s)

A general feed-forward TTS model implementation that can be configured into different architectures by setting different
encoder and decoder networks. It can be trained with either pre-computed durations (from a pre-trained Tacotron) or
an alignment network that learns the text-to-audio alignment from the input data.

Currently we provide the following pre-configured architectures:

- **FastSpeech:**

    It's a feed-forward TTS model that uses Feed Forward Transformer (FFT) modules as the encoder and decoder.

- **FastPitch:**

    It uses the same FastSpeech architecture, conditioned on fundamental frequency (f0) contours, with the
    promise of more expressive speech.

- **SpeedySpeech:**

    It uses Residual Convolution layers instead of Transformers, which leads to a more compute-friendly model.

- **FastSpeech2 (TODO):**

    Similar to FastPitch, but it also uses spectral energy values as an additional input.

## Important resources & papers
- FastPitch: https://arxiv.org/abs/2006.06873
- SpeedySpeech: https://arxiv.org/abs/2008.03802
- FastSpeech: https://arxiv.org/pdf/1905.09263
- FastSpeech2: https://arxiv.org/abs/2006.04558
- Aligner Network: https://arxiv.org/abs/2108.10447
- What is Pitch: https://www.britannica.com/topic/pitch-speech
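
A minimal sketch of selecting one of these architectures through its config class (the config classes are
documented below; all values are left at their defaults here):

```python
from TTS.tts.configs.fast_pitch_config import FastPitchConfig

# FastPitch preset of the ForwardTTS model; SpeedySpeechConfig and
# FastSpeechConfig (documented below) select the other presets.
config = FastPitchConfig()
```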
## ForwardTTSArgs
```{eval-rst}
.. autoclass:: TTS.tts.models.forward_tts.ForwardTTSArgs
    :members:
```

## ForwardTTS Model
```{eval-rst}
.. autoclass:: TTS.tts.models.forward_tts.ForwardTTS
    :members:
```

## FastPitchConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.fast_pitch_config.FastPitchConfig
    :members:
```

## SpeedySpeechConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.speedy_speech_config.SpeedySpeechConfig
    :members:
```

## FastSpeechConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.fast_speech_config.FastSpeechConfig
    :members:
```
@ -54,7 +54,7 @@
4. Run the training.

    You need to call the python training script.
    You need to run the training script.

    ```bash
    $ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
@ -63,7 +63,7 @@
Notice that you set the GPU you want to use on your system by setting the `CUDA_VISIBLE_DEVICES` environment variable.
To see the available GPUs on your system, you can use the `nvidia-smi` command on the terminal.

If you like to run a multi-gpu training
If you like to run a multi-gpu training using DDP back-end,

```bash
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
@ -1,3 +1,4 @@
(what_makes_a_good_dataset)=
# What makes a good TTS dataset

## What Makes a Good Dataset
@ -2,16 +2,14 @@
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
   ]
    "This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
@ -25,22 +23,23 @@
    "from TTS.tts.datasets.TTSDataset import TTSDataset\n",
    "from TTS.tts.layers.losses import L1LossMasked\n",
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.utils.io import load_config\n",
    "from TTS.config import load_config\n",
    "from TTS.tts.utils.visual import plot_spectrogram\n",
    "from TTS.tts.utils.generic_utils import setup_model, sequence_mask\n",
    "from TTS.tts.utils.helpers import sequence_mask\n",
    "from TTS.tts.models import setup_model\n",
    "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES']='0'"
   ]
    "os.environ['CUDA_VISIBLE_DEVICES']='2'"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def set_filename(wav_path, out_path):\n",
    "    wav_file = os.path.basename(wav_path)\n",
@ -52,20 +51,20 @@
    "    mel_path = os.path.join(out_path, \"mel\", file_name)\n",
    "    wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
    "    return file_name, wavq_path, mel_path, wav_path"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "OUT_PATH = \"/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA\"\n",
    "DATA_PATH = \"/home/erogol/gdrive/Datasets/non-binary-voice-files/\"\n",
    "DATASET = \"sam_accenture\"\n",
    "METADATA_FILE = \"recording_script.xml\"\n",
    "CONFIG_PATH = \"/home/erogol/gdrive/Trainings/sam/ljspeech-dcattn-April-03-2021_05+02-2344379/config.json\"\n",
    "MODEL_FILE = \"/home/erogol/gdrive/Trainings/sam/ljspeech-dcattn-April-03-2021_05+02-2344379/best_model.pth.tar\"\n",
    "OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n",
    "DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n",
    "DATASET = \"ljspeech\"\n",
    "METADATA_FILE = \"metadata.csv\"\n",
    "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
    "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
    "BATCH_SIZE = 32\n",
    "\n",
    "QUANTIZED_WAV = False\n",
@ -78,56 +77,63 @@
    "C = load_config(CONFIG_PATH)\n",
    "C.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
    "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(C['r'])\n",
    "# if the vocabulary was passed, replace the default\n",
    "if 'characters' in C.keys():\n",
    "if 'characters' in C and C['characters']:\n",
    "    symbols, phonemes = make_symbols(**C.characters)\n",
    "\n",
    "# load the model\n",
    "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
    "# TODO: multiple speaker\n",
    "model = setup_model(num_chars, num_speakers=0, c=C)\n",
    "checkpoint = torch.load(MODEL_FILE)\n",
    "model.load_state_dict(checkpoint['model'])\n",
    "print(checkpoint['step'])\n",
    "model.eval()\n",
    "model.decoder.set_r(checkpoint['r'])\n",
    "if use_cuda:\n",
    "    model = model.cuda()"
   ]
    "model = setup_model(C)\n",
    "model.load_checkpoint(C, MODEL_FILE, eval=True)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
    "preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
    "preprocessor = getattr(preprocessor, DATASET.lower())\n",
    "meta_data = preprocessor(DATA_PATH, METADATA_FILE)\n",
    "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,characters=c.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
    "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
   ]
    "dataset = TTSDataset(\n",
    "    checkpoint[\"config\"][\"r\"],\n",
    "    C.text_cleaner,\n",
    "    False,\n",
    "    ap,\n",
    "    meta_data,\n",
    "    characters=C.get('characters', None),\n",
    "    use_phonemes=C.use_phonemes,\n",
    "    phoneme_cache_path=C.phoneme_cache_path,\n",
    "    enable_eos_bos=C.enable_eos_bos_chars,\n",
    ")\n",
    "loader = DataLoader(\n",
    "    dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
    ")\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate model outputs "
   ]
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
@ -206,42 +212,42 @@
    "\n",
    "    print(np.mean(losses))\n",
    "    print(np.mean(postnet_losses))"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for pwgan\n",
    "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
    "    for data in metadata:\n",
    "        f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sanity Check"
   ]
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = 1\n",
    "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import soundfile as sf\n",
    "wav, sr = sf.read(item_idx[idx])\n",
@ -249,46 +255,46 @@
    "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
    "mel_truth = ap.melspectrogram(wav)\n",
    "print(mel_truth.shape)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot postnet output\n",
    "print(mel_postnet[:mel_lengths[idx], :].shape)\n",
    "plot_spectrogram(mel_postnet, ap)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot decoder output\n",
    "print(mel_decoder.shape)\n",
    "plot_spectrogram(mel_decoder, ap)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot GT spectrogram\n",
    "print(mel_truth.shape)\n",
    "plot_spectrogram(mel_truth.T, ap)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# postnet, decoder diff\n",
    "from matplotlib import pylab as plt\n",
@ -297,13 +303,13 @@
    "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
    "plt.colorbar()\n",
    "plt.tight_layout()"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PLOT GT SPECTROGRAM diff\n",
    "from matplotlib import pylab as plt\n",
@ -312,13 +318,13 @@
    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
    "plt.colorbar()\n",
    "plt.tight_layout()"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PLOT GT SPECTROGRAM diff\n",
    "from matplotlib import pylab as plt\n",
@ -328,21 +334,22 @@
    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
    "plt.colorbar()\n",
    "plt.tight_layout()"
   ]
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [],
   "outputs": [],
   "source": []
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
   "name": "python3",
   "display_name": "Python 3.9.7 64-bit ('base': conda)"
  },
  "language_info": {
   "codemirror_mode": {
|
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
   "version": "3.9.7"
  },
  "interpreter": {
   "hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
  }
 },
 "nbformat": 4,
@ -0,0 +1,68 @@
import os

from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import SpeedySpeechConfig
from TTS.utils.manage import ModelManager

output_path = os.path.dirname(os.path.abspath(__file__))

# init configs
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
    path=os.path.join(output_path, "../LJSpeech-1.1/"),
)

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = SpeedySpeechConfig(
    run_name="speedy_speech_ljspeech",
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

# compute alignments externally only if the model's internal aligner is disabled
if not config.model_args.use_aligner:
    manager = ModelManager()
    model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
    # TODO: make compute_attention python callable
    os.system(
        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
    )

# train the model
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
trainer.fit()
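A usage note: the use_aligner branch above only fires when the internal alignment network is disabled; with the aligner on, durations are learned during training and no external attention masks are needed. Below is a minimal launch-and-resume sketch; the recipe path is an assumption (adjust it to wherever this script lives in your checkout), and --continue_path is the same resume entry point exercised by the training tests later in this diff.

# Sketch: run the recipe once, then resume the newest run folder.
import glob
import os

os.system("CUDA_VISIBLE_DEVICES=0 python recipes/ljspeech/speedy_speech/train_speedy_speech.py")

# Runs land in timestamped folders under output_path (the script's directory here);
# picking the most recently modified folder is a heuristic, not an API guarantee.
continue_path = max(glob.glob("recipes/ljspeech/speedy_speech/*/"), key=os.path.getmtime)
os.system(f"python TTS/bin/train_tts.py --continue_path {continue_path}")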
4
setup.py
@ -54,8 +54,8 @@ with open("README.md", "r", encoding="utf-8") as readme_file:

exts = [
    Extension(
        name="TTS.tts.layers.glow_tts.monotonic_align.core",
        sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"],
        name="TTS.tts.utils.monotonic_align.core",
        sources=["TTS/tts/utils/monotonic_align/core.pyx"],
    )
]
setup(
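Since this hunk relocates the monotonic_align Cython source, development installs need the extension recompiled; a minimal sketch using standard setuptools (assumes Cython and numpy are installed, nothing repo-specific beyond setup.py itself):

# Sketch: rebuild the relocated Cython extension in place.
import os

os.system("python setup.py build_ext --inplace")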
@ -7,8 +7,8 @@ from TTS.utils.generic_utils import get_cuda

def get_device_id():
    use_cuda, _ = get_cuda()
    if use_cuda:
        if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "":
            GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]
        if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] != "":
            GPU_ID = os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0]
        else:
            GPU_ID = "0"
    else:
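For context, the training tests below use this helper to pin their subprocesses to a single GPU; a usage sketch (the command contents are illustrative):

# Sketch: prefix a training command with the first visible GPU id.
command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path config.json"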
@ -68,15 +68,15 @@ class TestTTSDataset(unittest.TestCase):
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data['text']
            text_lengths = data['text_lengths']
            speaker_name = data['speaker_names']
            linear_input = data['linear']
            mel_input = data['mel']
            mel_lengths = data['mel_lengths']
            stop_target = data['stop_targets']
            item_idx = data['item_idxs']
            wavs = data['waveform']
            text_input = data["text"]
            text_lengths = data["text_lengths"]
            speaker_name = data["speaker_names"]
            linear_input = data["linear"]
            mel_input = data["mel"]
            mel_lengths = data["mel_lengths"]
            stop_target = data["stop_targets"]
            item_idx = data["item_idxs"]
            wavs = data["waveform"]

            neg_values = text_input[text_input < 0]
            check_count = len(neg_values)
@ -113,14 +113,14 @@ class TestTTSDataset(unittest.TestCase):
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data['text']
            text_lengths = data['text_lengths']
            speaker_name = data['speaker_names']
            linear_input = data['linear']
            mel_input = data['mel']
            mel_lengths = data['mel_lengths']
            stop_target = data['stop_targets']
            item_idx = data['item_idxs']
            text_input = data["text"]
            text_lengths = data["text_lengths"]
            speaker_name = data["speaker_names"]
            linear_input = data["linear"]
            mel_input = data["mel"]
            mel_lengths = data["mel_lengths"]
            stop_target = data["stop_targets"]
            item_idx = data["item_idxs"]

            avg_length = mel_lengths.numpy().mean()
            assert avg_length >= last_length
@ -139,14 +139,14 @@ class TestTTSDataset(unittest.TestCase):
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data['text']
            text_lengths = data['text_lengths']
            speaker_name = data['speaker_names']
            linear_input = data['linear']
            mel_input = data['mel']
            mel_lengths = data['mel_lengths']
            stop_target = data['stop_targets']
            item_idx = data['item_idxs']
            text_input = data["text"]
            text_lengths = data["text_lengths"]
            speaker_name = data["speaker_names"]
            linear_input = data["linear"]
            mel_input = data["mel"]
            mel_lengths = data["mel_lengths"]
            stop_target = data["stop_targets"]
            item_idx = data["item_idxs"]

            # check mel_spec consistency
            wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
@ -188,14 +188,14 @@ class TestTTSDataset(unittest.TestCase):
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data['text']
            text_lengths = data['text_lengths']
            speaker_name = data['speaker_names']
            linear_input = data['linear']
            mel_input = data['mel']
            mel_lengths = data['mel_lengths']
            stop_target = data['stop_targets']
            item_idx = data['item_idxs']
            text_input = data["text"]
            text_lengths = data["text_lengths"]
            speaker_name = data["speaker_names"]
            linear_input = data["linear"]
            mel_input = data["mel"]
            mel_lengths = data["mel_lengths"]
            stop_target = data["stop_targets"]
            item_idx = data["item_idxs"]

            if mel_lengths[0] > mel_lengths[1]:
                idx = 0
@ -11,11 +11,10 @@ def test_synthesize():
    # single speaker model
    run_cli(f'tts --text "This is an example." --out_path "{output_path}"')
    run_cli(
        "tts --model_name tts_models/en/ljspeech/speedy-speech-wn "
        f'--text "This is an example." --out_path "{output_path}"'
        "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"'
    )
    run_cli(
        "tts --model_name tts_models/en/ljspeech/speedy-speech-wn "
        "tts --model_name tts_models/en/ljspeech/glow-tts "
        "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan "
        f'--text "This is an example." --out_path "{output_path}"'
    )
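The names passed to --model_name and --vocoder_name are resolved by the model manager; for reference, they can be enumerated with the same CLI (a usage sketch):

# Sketch: print every model name the tts CLI can resolve.
run_cli("tts --list_models")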
@ -1,47 +0,0 @@
import unittest

import torch as T

from TTS.tts.models.fast_pitch import FastPitch, FastPitchArgs, average_pitch

# pylint: disable=unused-variable


class AveragePitchTests(unittest.TestCase):
    def test_in_out(self):  # pylint: disable=no-self-use
        pitch = T.rand(1, 1, 128)

        durations = T.randint(1, 5, (1, 21))
        coeff = 128.0 / durations.sum()
        durations = T.round(durations * coeff)
        diff = 128.0 - durations.sum()
        durations[0, -1] += diff
        durations = durations.long()

        pitch_avg = average_pitch(pitch, durations)

        index = 0
        for idx, dur in enumerate(durations[0]):
            assert abs(pitch_avg[0, 0, idx] - pitch[0, 0, index : index + dur.item()].mean()) < 1e-5
            index += dur


def expand_encoder_outputs_test():
    model = FastPitch(FastPitchArgs(num_chars=10))

    inputs = T.rand(2, 5, 57)
    durations = T.randint(1, 4, (2, 57))

    x_mask = T.ones(2, 1, 57)
    y_mask = T.ones(2, 1, durations.sum(1).max())

    expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask)

    for b in range(durations.shape[0]):
        index = 0
        for idx, dur in enumerate(durations[b]):
            diff = (
                expanded[b, :, index : index + dur.item()]
                - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape)
            ).sum()
            assert abs(diff) < 1e-6, diff
            index += dur
@ -0,0 +1,68 @@
import glob
import os
import shutil

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs import FastPitchConfig

config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = FastPitchConfig(
    audio=audio_config,
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    f0_cache_path="tests/data/ljspeech/f0_cache/",
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    test_sentences=[
        "Be a voice, not an echo.",
    ],
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)

# train the model for one epoch
command_train = (
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
    f"--coqpit.output_path {output_path} "
    "--coqpit.datasets.0.name ljspeech "
    "--coqpit.datasets.0.meta_file_train metadata.csv "
    "--coqpit.datasets.0.meta_file_val metadata.csv "
    "--coqpit.datasets.0.path tests/data/ljspeech "
    "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
    "--coqpit.test_delay_epochs 0"
)
run_cli(command_train)

# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)
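The --coqpit.* flags above override nested config fields from the command line. As a rough in-Python equivalent, the same overrides can be set on the config object before it is saved (a sketch; it assumes the config's datasets field holds a default BaseDatasetConfig entry):

# Sketch: the in-Python counterpart of the --coqpit.* CLI overrides above.
config.output_path = output_path
config.datasets[0].name = "ljspeech"
config.datasets[0].meta_file_train = "metadata.csv"
config.datasets[0].meta_file_val = "metadata.csv"
config.datasets[0].path = "tests/data/ljspeech"
config.datasets[0].meta_file_attn_mask = "tests/data/ljspeech/metadata_attn_mask.txt"
config.test_delay_epochs = 0
config.save_json(config_path)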
@ -2,7 +2,7 @@ import torch

from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@ -0,0 +1,147 @@
import torch as T

from TTS.tts.models.forward_tts import ForwardTTS, ForwardTTSArgs
from TTS.tts.utils.helpers import sequence_mask

# pylint: disable=unused-variable


def expand_encoder_outputs_test():
    model = ForwardTTS(ForwardTTSArgs(num_chars=10))

    inputs = T.rand(2, 5, 57)
    durations = T.randint(1, 4, (2, 57))

    x_mask = T.ones(2, 1, 57)
    y_mask = T.ones(2, 1, durations.sum(1).max())

    expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask)

    for b in range(durations.shape[0]):
        index = 0
        for idx, dur in enumerate(durations[b]):
            diff = (
                expanded[b, :, index : index + dur.item()]
                - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape)
            ).sum()
            assert abs(diff) < 1e-6, diff
            index += dur


def model_input_output_test():
    """Assert the output shapes of the model in different modes"""

    # VANILLA MODEL
    model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=False))

    x = T.randint(0, 10, (2, 21))
    x_lengths = T.randint(10, 22, (2,))
    x_lengths[-1] = 21
    x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
    durations = T.randint(1, 4, (2, 21))
    durations = durations * x_mask.squeeze(1)
    y_lengths = durations.sum(1)
    y_mask = sequence_mask(y_lengths).unsqueeze(1).long()

    outputs = model.forward(x, x_lengths, y_lengths, dr=durations)

    assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
    assert outputs["durations_log"].shape == (2, 21)
    assert outputs["durations"].shape == (2, 21)
    assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
    assert (outputs["x_mask"] - x_mask).sum() == 0.0
    assert (outputs["y_mask"] - y_mask).sum() == 0.0

    assert outputs["alignment_soft"] is None
    assert outputs["alignment_mas"] is None
    assert outputs["alignment_logprob"] is None
    assert outputs["o_alignment_dur"] is None
    assert outputs["pitch_avg"] is None
    assert outputs["pitch_avg_gt"] is None

    # USE PITCH
    model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=False))

    x = T.randint(0, 10, (2, 21))
    x_lengths = T.randint(10, 22, (2,))
    x_lengths[-1] = 21
    x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
    durations = T.randint(1, 4, (2, 21))
    durations = durations * x_mask.squeeze(1)
    y_lengths = durations.sum(1)
    y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
    pitch = T.rand(2, 1, y_lengths.max())

    outputs = model.forward(x, x_lengths, y_lengths, dr=durations, pitch=pitch)

    assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
    assert outputs["durations_log"].shape == (2, 21)
    assert outputs["durations"].shape == (2, 21)
    assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
    assert (outputs["x_mask"] - x_mask).sum() == 0.0
    assert (outputs["y_mask"] - y_mask).sum() == 0.0
    assert outputs["pitch_avg"].shape == (2, 1, 21)
    assert outputs["pitch_avg_gt"].shape == (2, 1, 21)

    assert outputs["alignment_soft"] is None
    assert outputs["alignment_mas"] is None
    assert outputs["alignment_logprob"] is None
    assert outputs["o_alignment_dur"] is None

    # USE ALIGNER NETWORK
    model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=True))

    x = T.randint(0, 10, (2, 21))
    x_lengths = T.randint(10, 22, (2,))
    x_lengths[-1] = 21
    x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
    durations = T.randint(1, 4, (2, 21))
    durations = durations * x_mask.squeeze(1)
    y_lengths = durations.sum(1)
    y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
    y = T.rand(2, y_lengths.max(), 80)

    outputs = model.forward(x, x_lengths, y_lengths, dr=durations, y=y)

    assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
    assert outputs["durations_log"].shape == (2, 21)
    assert outputs["durations"].shape == (2, 21)
    assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
    assert (outputs["x_mask"] - x_mask).sum() == 0.0
    assert (outputs["y_mask"] - y_mask).sum() == 0.0
    assert outputs["alignment_soft"].shape == (2, durations.sum(1).max(), 21)
    assert outputs["alignment_mas"].shape == (2, durations.sum(1).max(), 21)
    assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21)
    assert outputs["o_alignment_dur"].shape == (2, 21)

    assert outputs["pitch_avg"] is None
    assert outputs["pitch_avg_gt"] is None

    # USE ALIGNER NETWORK AND PITCH
    model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=True))

    x = T.randint(0, 10, (2, 21))
    x_lengths = T.randint(10, 22, (2,))
    x_lengths[-1] = 21
    x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
    durations = T.randint(1, 4, (2, 21))
    durations = durations * x_mask.squeeze(1)
    y_lengths = durations.sum(1)
    y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
    y = T.rand(2, y_lengths.max(), 80)
    pitch = T.rand(2, 1, y_lengths.max())

    outputs = model.forward(x, x_lengths, y_lengths, dr=durations, pitch=pitch, y=y)

    assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
    assert outputs["durations_log"].shape == (2, 21)
    assert outputs["durations"].shape == (2, 21)
    assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
    assert (outputs["x_mask"] - x_mask).sum() == 0.0
    assert (outputs["y_mask"] - y_mask).sum() == 0.0
    assert outputs["alignment_soft"].shape == (2, durations.sum(1).max(), 21)
    assert outputs["alignment_mas"].shape == (2, durations.sum(1).max(), 21)
    assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21)
    assert outputs["o_alignment_dur"].shape == (2, 21)
    assert outputs["pitch_avg"].shape == (2, 1, 21)
    assert outputs["pitch_avg_gt"].shape == (2, 1, 21)
@ -0,0 +1,60 @@
import torch as T

from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask


def average_over_durations_test():  # pylint: disable=no-self-use
    pitch = T.rand(1, 1, 128)

    durations = T.randint(1, 5, (1, 21))
    coeff = 128.0 / durations.sum()
    durations = T.floor(durations * coeff)
    diff = 128.0 - durations.sum()
    durations[0, -1] += diff
    durations = durations.long()

    pitch_avg = average_over_durations(pitch, durations)

    index = 0
    for idx, dur in enumerate(durations[0]):
        assert abs(pitch_avg[0, 0, idx] - pitch[0, 0, index : index + dur.item()].mean()) < 1e-5
        index += dur


def sequence_mask_test():
    lengths = T.randint(10, 15, (8,))
    mask = sequence_mask(lengths)
    for i in range(8):
        l = lengths[i].item()
        assert mask[i, :l].sum() == l
        assert mask[i, l:].sum() == 0


def segment_test():
    x = T.range(0, 11)
    x = x.repeat(8, 1).unsqueeze(1)
    segment_ids = T.randint(0, 7, (8,))

    segments = segment(x, segment_ids, segment_size=4)
    for idx, start_idx in enumerate(segment_ids):
        assert x[idx, :, start_idx : start_idx + 4].sum() == segments[idx, :, :].sum()


def generate_path_test():
    durations = T.randint(1, 4, (10, 21))
    x_length = T.randint(18, 22, (10,))
    x_mask = sequence_mask(x_length).unsqueeze(1).long()
    durations = durations * x_mask.squeeze(1)
    y_length = durations.sum(1)
    y_mask = sequence_mask(y_length).unsqueeze(1).long()
    attn_mask = (T.unsqueeze(x_mask, -1) * T.unsqueeze(y_mask, 2)).squeeze(1).long()
    print(attn_mask.shape)
    path = generate_path(durations, attn_mask)
    assert path.shape == (10, 21, durations.sum(1).max().item())
    for b in range(durations.shape[0]):
        current_idx = 0
        for t in range(durations.shape[1]):
            assert all(path[b, t, current_idx : current_idx + durations[b, t].item()] == 1.0)
            assert all(path[b, t, :current_idx] == 0.0)
            assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0)
            current_idx += durations[b, t].item()
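generate_path expands per-token durations into a hard monotonic alignment over decoder frames; for intuition, a minimal sketch with concrete numbers (same tensor layout as the test above):

# Durations [2, 1, 3] over 6 frames: token 0 -> frames 0-1,
# token 1 -> frame 2, token 2 -> frames 3-5.
durs = T.tensor([[2, 1, 3]])
attn_mask = T.ones(1, 3, 6).long()  # (B, T_en, T_de), fully unmasked
path = generate_path(durs, attn_mask)
assert path[0, 0].tolist() == [1, 1, 0, 0, 0, 0]
assert path[0, 1].tolist() == [0, 0, 1, 0, 0, 0]
assert path[0, 2].tolist() == [0, 0, 0, 1, 1, 1]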
@ -1,96 +0,0 @@
import torch

from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
from TTS.tts.utils.data import sequence_mask

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def test_duration_predictor():
    input_dummy = torch.rand(8, 128, 27).to(device)
    input_lengths = torch.randint(20, 27, (8,)).long().to(device)
    input_lengths[-1] = 27

    x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)

    layer = DurationPredictor(hidden_channels=128).to(device)

    output = layer(input_dummy, x_mask)
    assert list(output.shape) == [8, 1, 27]


def test_speedy_speech():
    num_chars = 7
    B = 8
    T_en = 37
    T_de = 74

    x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
    x_lengths = torch.randint(31, T_en, (B,)).long().to(device)
    x_lengths[-1] = T_en

    # set durations. max total duration should be equal to T_de
    durations = torch.randint(1, 4, (B, T_en))
    durations = durations * (T_de / durations.sum(1)).unsqueeze(1)
    durations = durations.to(torch.long).to(device)
    max_dur = durations.sum(1).max()
    durations[:, 0] += T_de - max_dur if T_de > max_dur else 0

    y_lengths = durations.sum(1)

    config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
    model = SpeedySpeech(config)
    if use_cuda:
        model.cuda()

    # forward pass
    outputs = model(x_dummy, x_lengths, y_lengths, durations)
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]

    # with speaker embedding
    config = SpeedySpeechConfig(
        model_args=SpeedySpeechArgs(
            num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
        )
    )
    model = SpeedySpeech(config).to(device)
    model.forward(
        x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
    )
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]

    # with speaker external embedding
    config = SpeedySpeechConfig(
        model_args=SpeedySpeechArgs(
            num_chars=num_chars,
            out_channels=80,
            hidden_channels=128,
            num_speakers=10,
            use_d_vector=True,
            d_vector_dim=256,
        )
    )
    model = SpeedySpeech(config).to(device)
    model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]
@ -4,14 +4,12 @@ import shutil

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.models.speedy_speech import SpeedySpeechArgs

config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")


config = SpeedySpeechConfig(
    model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
@ -38,6 +38,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_train_step(self):
        """test forward pass"""
        (
@ -70,6 +71,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        # inference pass
        output = model(chars_seq, training=False)

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_forward_attention(
        self,
    ):
@ -103,6 +105,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        # inference pass
        output = model(chars_seq, training=False)

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_tflite_conversion(
        self,
    ):  # pylint:disable=no-self-use
@ -4,7 +4,7 @@ import torch as T

from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask

# pylint: disable=unused-variable
@ -1,9 +1,15 @@
import unittest

import numpy as np
import tensorflow as tf
import torch

from TTS.vocoder.tf.models.melgan_generator import MelganGenerator

use_cuda = torch.cuda.is_available()


@unittest.skipIf(use_cuda, " [!] Skip Test: Lousy TF support.")
def test_melgan_generator():
    hop_length = 256
    model = MelganGenerator()
@ -1,7 +1,9 @@
import os
import unittest

import soundfile as sf
import tensorflow as tf
import torch
from librosa.core import load

from tests import get_tests_input_path, get_tests_output_path, get_tests_path
@ -9,8 +11,10 @@ from TTS.vocoder.tf.layers.pqmf import PQMF

TESTS_PATH = get_tests_path()
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
use_cuda = torch.cuda.is_available()


@unittest.skipIf(use_cuda, " [!] Skip Test: Lousy TF support.")
def test_pqmf():
    w, sr = load(WAV_FILE)