mirror of https://github.com/coqui-ai/TTS.git
commit 0592a5805c
@@ -142,7 +142,6 @@ old_configs/*
 model_importers/*
 model_profiling/*
 docs/source/TODO/*
-docs/source/models/*
 .noseids
 .dccache
 log.txt
Makefile

@@ -1,5 +1,5 @@
 .DEFAULT_GOAL := help
-.PHONY: test system-deps dev-deps deps style lint install help
+.PHONY: test system-deps dev-deps deps style lint install help docs

 help:
     @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

@@ -45,3 +45,6 @@ deps: ## install 🐸 requirements.

 install: ## install 🐸 TTS for development.
     pip install -e .[all]
+
+docs: ## build the docs
+    $(MAKE) -C docs clean && $(MAKE) -C docs html
@@ -72,6 +72,8 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
 - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
 - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
+- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
+- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)

 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)

@@ -82,6 +84,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
 - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
 - Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
+- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)

 ### Speaker Encoder
 - GE2E: [paper](https://arxiv.org/abs/1710.10467)
@@ -38,6 +38,16 @@
                "license": "MPL",
                "contact": "egolge@coqui.com"
            },
+           "speedy-speech": {
+               "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
+               "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
+               "stats_file": null,
+               "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+               "commit": "4581e3d",
+               "author": "Eren Gölge @erogol",
+               "license": "TBD",
+               "contact": "egolge@coqui.com"
+           },
            "tacotron2-DCA": {
                "description": "",
                "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",

@@ -47,15 +57,6 @@
                "license": "MPL",
                "contact": "egolge@coqui.com"
            },
-           "speedy-speech-wn": {
-               "description": "Speedy Speech model with wavenet decoder.",
-               "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip",
-               "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
-               "commit": "77b6145",
-               "author": "Eren Gölge @erogol",
-               "license": "MPL",
-               "contact": "egolge@coqui.com"
-           },
            "vits": {
                "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
                "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip",

@@ -218,11 +219,11 @@
                "contact": "egolge@coqui.ai"
            },
            "univnet": {
-               "description": "UnivNet model trained on LJSpeech to complement the TacotronDDC_ph model.",
+               "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
-               "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/vocoder_models--en--ljspeech--univnet.zip",
+               "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
-               "commit": "3900448",
+               "commit": "4581e3d",
                "author": "Eren @erogol",
-               "license": "",
+               "license": "TBD",
                "contact": "egolge@coqui.ai"
            }
        },
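These registry edits (the new `speedy-speech` entry and the `univnet_v2` vocoder) are what the model manager reads when a model is requested by name. A minimal sketch of fetching such an entry, assuming the zip above maps to the name `tts_models/en/ljspeech/speedy_speech` and that `ModelManager.download_model` behaves as in other releases of this period; both are assumptions, not verified here:

    # Sketch only; model name and return values are assumptions based on the entry above.
    from TTS.utils.manage import ModelManager

    manager = ModelManager()  # assumed to read the bundled TTS/.models.json
    model_path, config_path, model_item = manager.download_model(
        "tts_models/en/ljspeech/speedy_speech"
    )
    print(model_item["default_vocoder"])  # "vocoder_models/en/ljspeech/hifigan_v2"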
@@ -1 +1 @@
-0.2.2
+0.3.0
@@ -16,7 +16,6 @@ from TTS.tts.models import setup_model
 from TTS.tts.utils.speakers import get_speaker_manager
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import count_parameters
-from TTS.utils.io import load_fsspec

 use_cuda = torch.cuda.is_available()


@@ -77,14 +76,14 @@ def set_filename(wav_path, out_path):

 def format_data(data):
     # setup input data
-    text_input = data['text']
-    text_lengths = data['text_lengths']
-    mel_input = data['mel']
-    mel_lengths = data['mel_lengths']
-    item_idx = data['item_idxs']
-    d_vectors = data['d_vectors']
-    speaker_ids = data['speaker_ids']
-    attn_mask = data['attns']
+    text_input = data["text"]
+    text_lengths = data["text_lengths"]
+    mel_input = data["mel"]
+    mel_lengths = data["mel_lengths"]
+    item_idx = data["item_idxs"]
+    d_vectors = data["d_vectors"]
+    speaker_ids = data["speaker_ids"]
+    attn_mask = data["attns"]
     avg_text_length = torch.mean(text_lengths.float())
     avg_spec_length = torch.mean(mel_lengths.float())


@@ -133,7 +132,11 @@ def inference(
    elif d_vectors is not None:
        speaker_c = d_vectors
    outputs = model.inference_with_MAS(
-       text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}
+       text_input,
+       text_lengths,
+       mel_input,
+       mel_lengths,
+       aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
    )
    model_output = outputs["model_outputs"]
    model_output = model_output.transpose(1, 2).detach().cpu().numpy()

@@ -239,8 +242,7 @@ def main(args):  # pylint: disable=redefined-outer-name
    model = setup_model(c)

    # restore model
-   checkpoint = load_fsspec(args.checkpoint_path, map_location="cpu")
-   model.load_state_dict(checkpoint["model"])
+   model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()
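The restore change above replaces a manual `load_fsspec` plus `load_state_dict` pair with the model's own `load_checkpoint` API. A small fragment mirroring the new pattern, with `c` being the config already loaded by the script and a placeholder checkpoint path:

    # Fragment mirroring the change above; the checkpoint path is a placeholder.
    model = setup_model(c)
    model.load_checkpoint(c, "/path/to/checkpoint.pth.tar", eval=True)  # loads weights and switches to eval mode
    if use_cuda:
        model.cuda()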
@@ -205,7 +205,7 @@ class Trainer:
             # load data for `tts` models
             self.data_train, self.data_eval = load_meta_data(self.config.datasets)
         elif self.config.feature_path is not None:
-            # load data for `vocoder` models
+            # load pre-computed features for `vocoder` models
             print(f" > Loading features from: {self.config.feature_path}")
             self.data_eval, self.data_train = load_wav_feat_data(
                 self.config.data_path, self.config.feature_path, self.config.eval_split_size

@@ -275,7 +275,8 @@ class Trainer:
         if self.args.continue_path:
             if isinstance(self.scheduler, list):
                 for scheduler in self.scheduler:
-                    scheduler.last_epoch = self.restore_step
+                    if scheduler is not None:
+                        scheduler.last_epoch = self.restore_step
             else:
                 self.scheduler.last_epoch = self.restore_step


@@ -662,6 +663,7 @@ class Trainer:
        lrs = {"current_lr": current_lr}

        # log run-time stats
+       loss_dict.update(lrs)
        loss_dict.update(
            {
                "step_time": round(step_time, 4),

@@ -878,7 +880,7 @@ class Trainer:
        """Restore the best loss from the args.best_path if provided else
        from the model (`args.restore_path` or `args.continue_path`) used for resuming the training"""
        if self.restore_step != 0 or self.args.best_path:
-           print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...")
+           print(f" > Restoring best loss from {os.path.basename(self.args.best_path)} ...")
            ch = load_fsspec(self.args.restore_path, map_location="cpu")
            if "model_loss" in ch:
                self.best_loss = ch["model_loss"]

@@ -1125,7 +1127,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
                last_model_num = model_num
                last_model = file_name

-       # if there is not checkpoint found above
+       # if there is no checkpoint found above
        # find the checkpoint with the latest
        # modification date.
        key_file_names = [fn for fn in file_names if key in fn]

@@ -1144,7 +1146,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
            last_models["checkpoint"] = last_models["best_model"]
        elif "best_model" not in last_models:  # no best model
            # this shouldn't happen, but let's handle it just in case
-           last_models["best_model"] = None
+           last_models["best_model"] = last_models["checkpoint"]
        # finally check if last best model is more recent than checkpoint
        elif last_model_nums["best_model"] > last_model_nums["checkpoint"]:
            last_models["checkpoint"] = last_models["best_model"]

@@ -1180,7 +1182,6 @@ def process_args(args, config=None):
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model
-
    # init config if not already defined
    if config is None:
        if args.config_path:
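The scheduler change above presumably matters when `self.scheduler` is a list that contains `None` entries, that is, when not every optimizer has a learning-rate scheduler attached. A standalone sketch of the guard (not the Trainer itself):

    # Illustration of the restore logic in the hunk above.
    def restore_scheduler_steps(schedulers, restore_step):
        if isinstance(schedulers, list):
            for scheduler in schedulers:
                if scheduler is not None:  # some optimizers may have no scheduler
                    scheduler.last_epoch = restore_step
        elif schedulers is not None:
            schedulers.last_epoch = restore_step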
@@ -2,12 +2,12 @@ from dataclasses import dataclass, field
 from typing import List

 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.models.fast_pitch import FastPitchArgs
+from TTS.tts.models.forward_tts import ForwardTTSArgs


 @dataclass
 class FastPitchConfig(BaseTTSConfig):
-    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
+    """Configure `ForwardTTS` as FastPitch model.

     Example:


@@ -18,6 +18,10 @@ class FastPitchConfig(BaseTTSConfig):
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.

+       base_model (str):
+           Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+           the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
        model_args (Coqpit):
            Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.


@@ -36,22 +40,43 @@ class FastPitchConfig(BaseTTSConfig):
        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.

-       noam_schedule (bool):
-           enable / disable the use of Noam LR scheduler. Defaults to False.
+       d_vector_dim (int):
+           Dimension of the external speaker embeddings. Defaults to 0.

-       warmup_steps (int):
-           Number of warm-up steps for the Noam scheduler. Defaults 4000.
+       optimizer (str):
+           Name of the model optimizer. Defaults to `Adam`.

+       optimizer_params (dict):
+           Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+       lr_scheduler (str):
+           Name of the learning rate scheduler. Defaults to `Noam`.
+
+       lr_scheduler_params (dict):
+           Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
        lr (float):
            Initial learning rate. Defaults to `1e-3`.

+       grad_clip (float):
+           Gradient norm clipping value. Defaults to `5.0`.
+
+       spec_loss_type (str):
+           Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+       duration_loss_type (str):
+           Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+       use_ssim_loss (bool):
+           Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
        wd (float):
            Weight decay coefficient. Defaults to `1e-7`.

        ssim_loss_alpha (float):
            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.

-       huber_loss_alpha (float):
+       dur_loss_alpha (float):
            Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.

        spec_loss_alpha (float):

@@ -74,8 +99,10 @@ class FastPitchConfig(BaseTTSConfig):
    """

    model: str = "fast_pitch"
+   base_model: str = "forward_tts"
+
    # model specific params
-   model_args: FastPitchArgs = field(default_factory=FastPitchArgs)
+   model_args: ForwardTTSArgs = ForwardTTSArgs()

    # multi-speaker settings
    use_speaker_embedding: bool = False

@@ -92,11 +119,13 @@ class FastPitchConfig(BaseTTSConfig):
    grad_clip: float = 5.0

    # loss params
+   spec_loss_type: str = "mse"
+   duration_loss_type: str = "mse"
+   use_ssim_loss: bool = True
    ssim_loss_alpha: float = 1.0
    dur_loss_alpha: float = 1.0
    spec_loss_alpha: float = 1.0
    pitch_loss_alpha: float = 1.0
-   dur_loss_alpha: float = 1.0
    aligner_loss_alpha: float = 1.0
    binary_align_loss_alpha: float = 1.0
    binary_align_loss_start_step: int = 20000
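A short usage sketch of the reworked config, assuming the same package export style used by the docstring examples in this commit and touching only fields that appear in the hunks above:

    # Sketch: FastPitch is now a ForwardTTS configuration rather than a separate model.
    from TTS.tts.configs import FastPitchConfig
    from TTS.tts.models.forward_tts import ForwardTTSArgs

    config = FastPitchConfig(
        model_args=ForwardTTSArgs(use_pitch=True, use_aligner=True),
        spec_loss_type="mse",
        duration_loss_type="mse",
        use_ssim_loss=True,
    )
    print(config.base_model)  # "forward_tts" -> setup_model() instantiates the shared ForwardTTS class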
@@ -0,0 +1,151 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class FastSpeechConfig(BaseTTSConfig):
+    """Configure `ForwardTTS` as FastSpeech model.
+
+    Example:
+
+        >>> from TTS.tts.configs import FastSpeechConfig
+        >>> config = FastSpeechConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
+
+        base_model (str):
+            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+        model_args (Coqpit):
+            Model class arguments. Check `FastSpeechArgs` for more details. Defaults to `FastSpeechArgs()`.
+
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.
+
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+
+        use_d_vector_file (bool):
+            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+        d_vector_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+        d_vector_dim (int):
+            Dimension of the external speaker embeddings. Defaults to 0.
+
+        optimizer (str):
+            Name of the model optimizer. Defaults to `Adam`.
+
+        optimizer_params (dict):
+            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+        lr_scheduler (str):
+            Name of the learning rate scheduler. Defaults to `Noam`.
+
+        lr_scheduler_params (dict):
+            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+
+        grad_clip (float):
+            Gradient norm clipping value. Defaults to `5.0`.
+
+        spec_loss_type (str):
+            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        duration_loss_type (str):
+            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        use_ssim_loss (bool):
+            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+
+        ssim_loss_alpha (float):
+            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+        dur_loss_alpha (float):
+            Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+        spec_loss_alpha (float):
+            Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+        pitch_loss_alpha (float):
+            Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
+
+        binary_loss_alpha (float):
+            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+        binary_align_loss_start_step (int):
+            Start binary alignment loss after this many steps. Defaults to 20000.
+
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+    """
+
+    model: str = "fast_speech"
+    base_model: str = "forward_tts"
+
+    # model specific params
+    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+
+    # multi-speaker settings
+    use_speaker_embedding: bool = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
+    d_vector_dim: int = 0
+
+    # optimizer parameters
+    optimizer: str = "Adam"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+    lr_scheduler: str = "NoamLR"
+    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+    lr: float = 1e-4
+    grad_clip: float = 5.0
+
+    # loss params
+    spec_loss_type: str = "mse"
+    duration_loss_type: str = "mse"
+    use_ssim_loss: bool = True
+    ssim_loss_alpha: float = 1.0
+    dur_loss_alpha: float = 1.0
+    spec_loss_alpha: float = 1.0
+    pitch_loss_alpha: float = 0.0
+    aligner_loss_alpha: float = 1.0
+    binary_align_loss_alpha: float = 1.0
+    binary_align_loss_start_step: int = 20000
+
+    # overrides
+    min_seq_len: int = 13
+    max_seq_len: int = 200
+    r: int = 1  # DO NOT CHANGE
+
+    # dataset configs
+    compute_f0: bool = True
+    f0_cache_path: str = None
+
+    # testing
+    test_sentences: List[str] = field(
+        default_factory=lambda: [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "Be a voice, not an echo.",
+            "I'm sorry Dave. I'm afraid I can't do that.",
+            "This cake is great. It's so delicious and moist.",
+            "Prior to November 22, 1963.",
+        ]
+    )
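FastSpeech reuses the same `ForwardTTS` base; structurally, the only difference from the FastPitch config above is that the pitch predictor is disabled and the pitch loss weight is zero. A minimal sketch based on the defaults added in this file:

    from TTS.tts.configs import FastSpeechConfig

    config = FastSpeechConfig()
    print(config.base_model)            # "forward_tts"
    print(config.model_args.use_pitch)  # False -> no pitch predictor branch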
@@ -2,81 +2,160 @@ from dataclasses import dataclass, field
 from typing import List

 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.models.speedy_speech import SpeedySpeechArgs
+from TTS.tts.models.forward_tts import ForwardTTSArgs


 @dataclass
 class SpeedySpeechConfig(BaseTTSConfig):
-    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
+    """Configure `ForwardTTS` as SpeedySpeech model.

     Example:

         >>> from TTS.tts.configs import SpeedySpeechConfig
         >>> config = SpeedySpeechConfig()

     Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.

+       base_model (str):
+           Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+           the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
        model_args (Coqpit):
-           Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
+           Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.

        data_dep_init_steps (int):
            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
            for the rest. Defaults to 10.

        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.

        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.

        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
-       noam_schedule (bool):
-           enable / disable the use of Noam LR scheduler. Defaults to False.
-       warmup_steps (int):
-           Number of warm-up steps for the Noam scheduler. Defaults 4000.
+       d_vector_dim (int):
+           Dimension of the external speaker embeddings. Defaults to 0.
+       optimizer (str):
+           Name of the model optimizer. Defaults to `RAdam`.
+
+       optimizer_params (dict):
+           Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+       lr_scheduler (str):
+           Name of the learning rate scheduler. Defaults to `Noam`.
+
+       lr_scheduler_params (dict):
+           Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
        lr (float):
            Initial learning rate. Defaults to `1e-3`.

+       grad_clip (float):
+           Gradient norm clipping value. Defaults to `5.0`.
+
+       spec_loss_type (str):
+           Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `l1`.
+
+       duration_loss_type (str):
+           Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `huber`.
+
+       use_ssim_loss (bool):
+           Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
        wd (float):
            Weight decay coefficient. Defaults to `1e-7`.
-       ssim_alpha (float):
-           Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
-       huber_alpha (float):
-           Weight for the duration predictor's loss. Defaults to 1.0.
-       l1_alpha (float):
-           Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
+       ssim_loss_alpha (float):
+           Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+       dur_loss_alpha (float):
+           Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+       spec_loss_alpha (float):
+           Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+       binary_loss_alpha (float):
+           Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+       binary_align_loss_start_step (int):
+           Start binary alignment loss after this many steps. Defaults to 20000.
+
        min_seq_len (int):
            Minimum input sequence length to be used at training.

        max_seq_len (int):
            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
    """

    model: str = "speedy_speech"
-   # model specific params
-   model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
+   base_model: str = "forward_tts"
+
+   # set model args as SpeedySpeech
+   model_args: ForwardTTSArgs = ForwardTTSArgs(
+       use_pitch=False,
+       encoder_type="residual_conv_bn",
+       encoder_params={
+           "kernel_size": 4,
+           "dilations": 4 * [1, 2, 4] + [1],
+           "num_conv_blocks": 2,
+           "num_res_blocks": 13,
+       },
+       decoder_type="residual_conv_bn",
+       decoder_params={
+           "kernel_size": 4,
+           "dilations": 4 * [1, 2, 4, 8] + [1],
+           "num_conv_blocks": 2,
+           "num_res_blocks": 17,
+       },
+       out_channels=80,
+       hidden_channels=128,
+       num_speakers=0,
+       positional_encoding=True,
+       detach_duration_predictor=True
+   )
+
    # multi-speaker settings
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_file: str = False
+   d_vector_dim: int = 0

    # optimizer parameters
-   optimizer: str = "RAdam"
+   optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
-   lr_scheduler: str = None
-   lr_scheduler_params: dict = None
+   lr_scheduler: str = "NoamLR"
+   lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
    lr: float = 1e-4
    grad_clip: float = 5.0

    # loss params
-   ssim_alpha: float = 1.0
-   huber_alpha: float = 1.0
-   l1_alpha: float = 1.0
+   spec_loss_type: str = "l1"
+   duration_loss_type: str = "huber"
+   use_ssim_loss: bool = False
+   ssim_loss_alpha: float = 1.0
+   dur_loss_alpha: float = 1.0
+   spec_loss_alpha: float = 1.0
+   aligner_loss_alpha: float = 1.0
+   binary_align_loss_alpha: float = 0.3
+   binary_align_loss_start_step: int = 50000

    # overrides
    min_seq_len: int = 13
    max_seq_len: int = 200
    r: int = 1  # DO NOT CHANGE

+   # dataset configs
+   compute_f0: bool = False
+   f0_cache_path: str = None
+
    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
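SpeedySpeech is likewise re-expressed as a `ForwardTTS` configuration: a residual conv-BN encoder/decoder, no pitch predictor, an L1 spectrogram loss and a Huber duration loss. A small sketch of inspecting those choices, using only fields defined above:

    from TTS.tts.configs import SpeedySpeechConfig

    config = SpeedySpeechConfig()
    print(config.model_args.encoder_type)                    # "residual_conv_bn"
    print(config.model_args.use_pitch)                        # False
    print(config.spec_loss_type, config.duration_loss_type)   # "l1" "huber"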
@@ -1,15 +1 @@
 from TTS.tts.layers.losses import *
-
-
-def setup_loss(config):
-    if config.model.lower() in ["tacotron", "tacotron2"]:
-        model = TacotronLoss(config)
-    elif config.model.lower() == "glow_tts":
-        model = GlowTTSLoss()
-    elif config.model.lower() == "speedy_speech":
-        model = SpeedySpeechLoss(config)
-    elif config.model.lower() == "align_tts":
-        model = AlignTTSLoss(config)
-    else:
-        raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.")
-    return model
@@ -70,7 +70,9 @@ class FFTransformerBlock(nn.Module):


 class FFTDurationPredictor:
-    def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None):  # pylint: disable=unused-argument
+    def __init__(
+        self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
+    ):  # pylint: disable=unused-argument
         self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
         self.proj = nn.Linear(in_channels, 1)

@@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask


 class Encoder(nn.Module):
@@ -1,106 +0,0 @@
-import numpy as np
-import torch
-from torch.nn import functional as F
-
-from TTS.tts.utils.data import sequence_mask
-
-try:
-    # TODO: fix pypi cython installation problem.
-    from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c
-
-    CYTHON = True
-except ModuleNotFoundError:
-    CYTHON = False
-
-
-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-
-
-def generate_path(duration, mask):
-    """
-    Shapes:
-        - duration: :math:`[B, T_en]`
-        - mask: :math:'[B, T_en, T_de]`
-        - path: :math:`[B, T_en, T_de]`
-    """
-    device = duration.device
-    b, t_x, t_y = mask.shape
-    cum_duration = torch.cumsum(duration, 1)
-    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
-
-    cum_duration_flat = cum_duration.view(b * t_x)
-    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
-    path = path.view(b, t_x, t_y)
-    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-    path = path * mask
-    return path
-
-
-def maximum_path(value, mask):
-    if CYTHON:
-        return maximum_path_cython(value, mask)
-    return maximum_path_numpy(value, mask)
-
-
-def maximum_path_cython(value, mask):
-    """Cython optimised version.
-    Shapes:
-        - value: :math:`[B, T_en, T_de]`
-        - mask: :math:`[B, T_en, T_de]`
-    """
-    value = value * mask
-    device = value.device
-    dtype = value.dtype
-    value = value.data.cpu().numpy().astype(np.float32)
-    path = np.zeros_like(value).astype(np.int32)
-    mask = mask.data.cpu().numpy()
-
-    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
-    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
-    maximum_path_c(path, value, t_x_max, t_y_max)
-    return torch.from_numpy(path).to(device=device, dtype=dtype)
-
-
-def maximum_path_numpy(value, mask, max_neg_val=None):
-    """
-    Monotonic alignment search algorithm
-    Numpy-friendly version. It's about 4 times faster than torch version.
-    value: [b, t_x, t_y]
-    mask: [b, t_x, t_y]
-    """
-    if max_neg_val is None:
-        max_neg_val = -np.inf  # Patch for Sphinx complaint
-    value = value * mask
-
-    device = value.device
-    dtype = value.dtype
-    value = value.cpu().detach().numpy()
-    mask = mask.cpu().detach().numpy().astype(np.bool)
-
-    b, t_x, t_y = value.shape
-    direction = np.zeros(value.shape, dtype=np.int64)
-    v = np.zeros((b, t_x), dtype=np.float32)
-    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
-    for j in range(t_y):
-        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
-        v1 = v
-        max_mask = v1 >= v0
-        v_max = np.where(max_mask, v1, v0)
-        direction[:, :, j] = max_mask
-
-        index_mask = x_range <= j
-        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
-    direction = np.where(mask, direction, 1)
-
-    path = np.zeros(value.shape, dtype=np.float32)
-    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
-    index_range = np.arange(b)
-    for j in reversed(range(t_y)):
-        path[index_range, index, j] = 1
-        index = index + direction[index_range, index, j] - 1
-    path = path * mask.astype(np.float32)
-    path = torch.from_numpy(path).to(device=device, dtype=dtype)
-    return path
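The deleted module is not lost functionality: the other hunks in this commit import the same utilities from the shared helpers module instead, for example:

    # New import location used throughout this commit.
    from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask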
@@ -6,7 +6,7 @@ from coqpit import Coqpit
 from torch import nn
 from torch.nn import functional

-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.ssim import ssim
 from TTS.utils.audio import TorchSTFT


@@ -236,10 +236,40 @@ class Huber(nn.Module):
            y: B x T
            length: B
        """
-       mask = sequence_mask(sequence_length=length, max_len=y.size(1)).float()
+       mask = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2).float()
        return torch.nn.functional.smooth_l1_loss(x * mask, y * mask, reduction="sum") / mask.sum()


+class ForwardSumLoss(nn.Module):
+    def __init__(self, blank_logprob=-1):
+        super().__init__()
+        self.log_softmax = torch.nn.LogSoftmax(dim=3)
+        self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
+        self.blank_logprob = blank_logprob
+
+    def forward(self, attn_logprob, in_lens, out_lens):
+        key_lens = in_lens
+        query_lens = out_lens
+        attn_logprob_padded = torch.nn.functional.pad(input=attn_logprob, pad=(1, 0), value=self.blank_logprob)
+
+        total_loss = 0.0
+        for bid in range(attn_logprob.shape[0]):
+            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
+            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[: query_lens[bid], :, : key_lens[bid] + 1]
+
+            curr_logprob = self.log_softmax(curr_logprob[None])[0]
+            loss = self.ctc_loss(
+                curr_logprob,
+                target_seq,
+                input_lengths=query_lens[bid : bid + 1],
+                target_lengths=key_lens[bid : bid + 1],
+            )
+            total_loss = total_loss + loss
+
+        total_loss = total_loss / attn_logprob.shape[0]
+        return total_loss
+
+
 ########################
 # MODEL LOSS LAYERS
 ########################

@@ -413,25 +443,6 @@ class GlowTTSLoss(torch.nn.Module):
        return return_dict


-class SpeedySpeechLoss(nn.Module):
-    def __init__(self, c):
-        super().__init__()
-        self.l1 = L1LossMasked(False)
-        self.ssim = SSIMLoss()
-        self.huber = Huber()
-
-        self.ssim_alpha = c.ssim_alpha
-        self.huber_alpha = c.huber_alpha
-        self.l1_alpha = c.l1_alpha
-
-    def forward(self, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens):
-        l1_loss = self.l1(decoder_output, decoder_target, decoder_output_lens)
-        ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
-        huber_loss = self.huber(dur_output, dur_target, input_lens)
-        loss = self.l1_alpha * l1_loss + self.ssim_alpha * ssim_loss + self.huber_alpha * huber_loss
-        return {"loss": loss, "loss_l1": l1_loss, "loss_ssim": ssim_loss, "loss_dur": huber_loss}
-
-
 def mse_loss_custom(x, y):
    """MSE loss using the torch back-end without reduction.
    It uses less VRAM than the raw code"""

@@ -660,51 +671,41 @@ class VitsDiscriminatorLoss(nn.Module):
        return return_dict


-class ForwardSumLoss(nn.Module):
-    def __init__(self, blank_logprob=-1):
-        super().__init__()
-        self.log_softmax = torch.nn.LogSoftmax(dim=3)
-        self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
-        self.blank_logprob = blank_logprob
-
-    def forward(self, attn_logprob, in_lens, out_lens):
-        key_lens = in_lens
-        query_lens = out_lens
-        attn_logprob_padded = torch.nn.functional.pad(input=attn_logprob, pad=(1, 0), value=self.blank_logprob)
-
-        total_loss = 0.0
-        for bid in range(attn_logprob.shape[0]):
-            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
-            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[: query_lens[bid], :, : key_lens[bid] + 1]
-
-            curr_logprob = self.log_softmax(curr_logprob[None])[0]
-            loss = self.ctc_loss(
-                curr_logprob,
-                target_seq,
-                input_lengths=query_lens[bid : bid + 1],
-                target_lengths=key_lens[bid : bid + 1],
-            )
-            total_loss = total_loss + loss
-
-        total_loss = total_loss / attn_logprob.shape[0]
-        return total_loss
-
-
-class FastPitchLoss(nn.Module):
+class ForwardTTSLoss(nn.Module):
+    """Generic configurable ForwardTTS loss."""
     def __init__(self, c):
         super().__init__()
-        self.spec_loss = MSELossMasked(False)
-        self.ssim = SSIMLoss()
-        self.dur_loss = MSELossMasked(False)
-        self.pitch_loss = MSELossMasked(False)
+        if c.spec_loss_type == "mse":
+            self.spec_loss = MSELossMasked(False)
+        elif c.spec_loss_type == "l1":
+            self.spec_loss = L1LossMasked(False)
+        else:
+            raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type))
+
+        if c.duration_loss_type == "mse":
+            self.dur_loss = MSELossMasked(False)
+        elif c.duration_loss_type == "l1":
+            self.dur_loss = L1LossMasked(False)
+        elif c.duration_loss_type == "huber":
+            self.dur_loss = Huber()
+        else:
+            raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type))
+
         if c.model_args.use_aligner:
             self.aligner_loss = ForwardSumLoss()
+            self.aligner_loss_alpha = c.aligner_loss_alpha
+
+        if c.model_args.use_pitch:
+            self.pitch_loss = MSELossMasked(False)
+            self.pitch_loss_alpha = c.pitch_loss_alpha
+
+        if c.use_ssim_loss:
+            self.ssim = SSIMLoss() if c.use_ssim_loss else None
+            self.ssim_loss_alpha = c.ssim_loss_alpha
+
         self.spec_loss_alpha = c.spec_loss_alpha
-        self.ssim_loss_alpha = c.ssim_loss_alpha
         self.dur_loss_alpha = c.dur_loss_alpha
-        self.pitch_loss_alpha = c.pitch_loss_alpha
-        self.aligner_loss_alpha = c.aligner_loss_alpha
         self.binary_alignment_loss_alpha = c.binary_align_loss_alpha

     @staticmethod

@@ -731,7 +732,7 @@ class FastPitchLoss(nn.Module):
    ):
        loss = 0
        return_dict = {}
-       if self.ssim_loss_alpha > 0:
+       if hasattr(self, "ssim_loss") and self.ssim_loss_alpha > 0:
            ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
            loss = loss + self.ssim_loss_alpha * ssim_loss
            return_dict["loss_ssim"] = self.ssim_loss_alpha * ssim_loss

@@ -747,12 +748,12 @@ class FastPitchLoss(nn.Module):
            loss = loss + self.dur_loss_alpha * dur_loss
            return_dict["loss_dur"] = self.dur_loss_alpha * dur_loss

-       if self.pitch_loss_alpha > 0:
+       if hasattr(self, "pitch_loss") and self.pitch_loss_alpha > 0:
            pitch_loss = self.pitch_loss(pitch_output.transpose(1, 2), pitch_target.transpose(1, 2), input_lens)
            loss = loss + self.pitch_loss_alpha * pitch_loss
            return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss

-       if self.aligner_loss_alpha > 0:
+       if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
            aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
            loss = loss + self.aligner_loss_alpha * aligner_loss
            return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss
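`ForwardTTSLoss` replaces the per-model loss classes with one configurable criterion whose spectrogram and duration terms are chosen from the config. A hedged sketch of driving it from the config fields added in this commit (the construction details are illustrative only):

    # Illustrative only: shows which config fields steer ForwardTTSLoss.
    from TTS.tts.configs import SpeedySpeechConfig
    from TTS.tts.layers.losses import ForwardTTSLoss

    c = SpeedySpeechConfig()       # spec_loss_type="l1", duration_loss_type="huber"
    criterion = ForwardTTSLoss(c)  # -> L1LossMasked spectrogram loss, Huber duration loss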
@@ -5,7 +5,7 @@ from torch import nn

 from TTS.tts.layers.glow_tts.glow import WN
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask

 LRELU_SLOPE = 0.1

@@ -4,7 +4,11 @@ from TTS.utils.generic_utils import find_module

 def setup_model(config):
     print(" > Using model: {}".format(config.model))
-    MyModel = find_module("TTS.tts.models", config.model.lower())
+    # fetch the right model implementation.
+    if "base_model" in config and config["base_model"] is not None:
+        MyModel = find_module("TTS.tts.models", config.base_model.lower())
+    else:
+        MyModel = find_module("TTS.tts.models", config.model.lower())
     # define set of characters used by the model
     if config.characters is not None:
         # set characters from config
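The `base_model` lookup above is what lets several configs share one implementation. A minimal standalone sketch of the resolution order (a hypothetical helper, not the repo's function):

    # Hypothetical standalone version of the lookup added above.
    from TTS.utils.generic_utils import find_module

    def resolve_model_module(config):
        # prefer the shared base implementation when the config names one
        if "base_model" in config and config["base_model"] is not None:
            return find_module("TTS.tts.models", config.base_model.lower())
        return find_module("TTS.tts.models", config.model.lower())

With this, FastPitchConfig, FastSpeechConfig and SpeedySpeechConfig all resolve to the `forward_tts` module while keeping their own `model` names.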
@@ -10,9 +10,8 @@ from TTS.tts.layers.feed_forward.decoder import Decoder
 from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
 from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
-from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_fsspec

@@ -168,7 +167,12 @@ class AlignTTS(BaseTTS):
        return dr_mas.squeeze(1), log_p

    @staticmethod
-   def convert_dr_to_align(dr, x_mask, y_mask):
+   def generate_attn(dr, x_mask, y_mask=None):
+       # compute decode mask from the durations
+       if y_mask is None:
+           y_lengths = dr.sum(1).long()
+           y_lengths[y_lengths < 1] = 1
+           y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
        return attn

@@ -187,7 +191,7 @@ class AlignTTS(BaseTTS):
        [0, 1, 1, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0]]
        """
-       attn = self.convert_dr_to_align(dr, x_mask, y_mask)
+       attn = self.generate_attn(dr, x_mask, y_mask)
        o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
        return o_en_ex, attn


@@ -275,7 +279,7 @@ class AlignTTS(BaseTTS):
            o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
            dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
            y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
-           attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask)
+           attn = self.generate_attn(dr_mas, x_mask, y_mask)
        elif phase == 1:
            # train decoder
            o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
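`generate_attn` turns per-token durations into a hard alignment map; conceptually, each encoder step is just repeated for its predicted number of frames. A toy sketch of that expansion (not the repo's `generate_path`):

    import torch

    durations = torch.tensor([2, 3, 1])   # frames per input token
    en = torch.randn(1, 4, 3)             # [B, C, T_en] encoder outputs
    o_ex = torch.repeat_interleave(en, durations, dim=2)
    print(o_ex.shape)                     # torch.Size([1, 4, 6]) -> [B, C, T_de]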
@@ -9,7 +9,7 @@ from torch import nn

 from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
 from TTS.tts.utils.text import make_symbols
 from TTS.utils.generic_utils import format_aux_input
@@ -115,12 +115,19 @@ class BaseTacotron(BaseTTS):
     ):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
+        # TODO: set r in run-time by taking it from the new config
         if "r" in state:
+            # set r from the state (for compatibility with older checkpoints)
             self.decoder.set_r(state["r"])
-        else:
+        elif "config" in state:
+            # set r from config used at training time (for inference)
             self.decoder.set_r(state["config"]["r"])
+        else:
+            # set r from the new config (for new-models)
+            self.decoder.set_r(config.r)
         if eval:
             self.eval()
+            print(f" > Model's reduction rate `r` is set to: {self.decoder.r}")
             assert not self.training

     def get_criterion(self) -> nn.Module:
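# A condensed view (not part of the diff) of the reduction-factor fallback introduced in
# the hunk above when loading a Tacotron checkpoint; `state` and `config` follow the diff.
def _resolve_r(state, config):
    if "r" in state:                  # old checkpoints store `r` directly
        return state["r"]
    if "config" in state:             # newer checkpoints carry their training config
        return state["config"]["r"]
    return config.r                   # otherwise use the config passed to load_checkpoint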
@@ -11,16 +11,15 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.aligner import AlignmentNetwork
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
-from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.data import sequence_mask
+from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
 from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
 from TTS.utils.audio import AudioProcessor


 @dataclass
-class FastPitchArgs(Coqpit):
-    """Fast Pitch Model arguments.
+class ForwardTTSArgs(Coqpit):
+    """ForwardTTS Model arguments.

     Args:

@@ -36,6 +35,14 @@ class FastPitchArgs(Coqpit):
         num_speakers (int):
             Number of speakers for the speaker embedding layer. Defaults to 0.

+        use_aligner (bool):
+            Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
+            If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
+            pre-computed durations must be provided to `config.datasets[0].meta_file_attn_mask`. Defaults to True.
+
+        use_pitch (bool):
+            Use pitch predictor to learn the pitch. Defaults to True.
+
         duration_predictor_hidden_channels (int):
             Number of hidden channels in the duration predictor. Defaults to 256.

@@ -93,21 +100,21 @@ class FastPitchArgs(Coqpit):
         max_duration (int):
             Maximum duration accepted by the model. Defaults to 75.

-        use_aligner (bool):
-            Use aligner network to learn the text to speech alignment. Defaults to True.
     """

     num_chars: int = None
     out_channels: int = 80
     hidden_channels: int = 384
     num_speakers: int = 0
-    duration_predictor_hidden_channels: int = 256
-    duration_predictor_kernel_size: int = 3
-    duration_predictor_dropout_p: float = 0.1
+    use_aligner: bool = True
+    use_pitch: bool = True
     pitch_predictor_hidden_channels: int = 256
     pitch_predictor_kernel_size: int = 3
     pitch_predictor_dropout_p: float = 0.1
     pitch_embedding_kernel_size: int = 3
+    duration_predictor_hidden_channels: int = 256
+    duration_predictor_kernel_size: int = 3
+    duration_predictor_dropout_p: float = 0.1
     positional_encoding: bool = True
     poisitonal_encoding_use_scale: bool = True
     length_scale: int = 1
@@ -123,32 +130,32 @@ class FastPitchArgs(Coqpit):
     d_vector_dim: int = 0
     detach_duration_predictor: bool = False
     max_duration: int = 75
-    use_aligner: bool = True


-class FastPitch(BaseTTS):
-    """FastPitch model. Very similart to SpeedySpeech model but with pitch prediction.
-
-    Paper::
-        https://arxiv.org/abs/2006.06873
-
-    Paper abstract::
-        We present FastPitch, a fully-parallel text-to-speech model based on FastSpeech, conditioned on fundamental
-        frequency contours. The model predicts pitch contours during inference. By altering these predictions,
-        the generated speech can be more expressive, better match the semantic of the utterance, and in the end
-        more engaging to the listener. Uniformly increasing or decreasing pitch with FastPitch generates speech
-        that resembles the voluntary modulation of voice. Conditioning on frequency contours improves the overall
-        quality of synthesized speech, making it comparable to state-of-the-art. It does not introduce an overhead,
-        and FastPitch retains the favorable, fully-parallel Transformer architecture, with over 900x real-time
-        factor for mel-spectrogram synthesis of a typical utterance."
+class ForwardTTS(BaseTTS):
+    """General forward TTS model implementation that uses an encoder-decoder architecture with an optional alignment
+    network and a pitch predictor.
+
+    If the alignment network is used, the model learns the text-to-speech alignment
+    from the data instead of using pre-computed durations.
+
+    If the pitch predictor is used, the model trains a pitch predictor that predicts average pitch value for each
+    input character as in the FastPitch model.
+
+    `ForwardTTS` can be configured to one of these architectures,
+
+        - FastPitch
+        - SpeedySpeech
+        - FastSpeech
+        - TODO: FastSpeech2 (requires average speech energy predictor)

     Args:
         config (Coqpit): Model coqpit class.

     Examples:
-        >>> from TTS.tts.models.fast_pitch import FastPitch, FastPitchArgs
-        >>> config = FastPitchArgs()
-        >>> model = FastPitch(config)
+        >>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
+        >>> config = ForwardTTSArgs()
+        >>> model = ForwardTTS(config)
     """

     # pylint: disable=dangerous-default-value
@@ -157,24 +164,25 @@ class FastPitch(BaseTTS):
         super().__init__()

         # don't use isintance not to import recursively
-        if config.__class__.__name__ == "FastPitchConfig":
+        if "Config" in config.__class__.__name__:
             if "characters" in config:
                 # loading from FasrPitchConfig
                 _, self.config, num_chars = self.get_characters(config)
                 config.model_args.num_chars = num_chars
                 self.args = self.config.model_args
             else:
-                # loading from FastPitchArgs
+                # loading from ForwardTTSArgs
                 self.config = config
                 self.args = config.model_args
-        elif isinstance(config, FastPitchArgs):
+        elif isinstance(config, ForwardTTSArgs):
             self.args = config
             self.config = config
         else:
-            raise ValueError("config must be either a VitsConfig or Vitsself.args")
+            raise ValueError("config must be either a *Config or ForwardTTSArgs")

         self.max_duration = self.args.max_duration
         self.use_aligner = self.args.use_aligner
+        self.use_pitch = self.args.use_pitch
         self.use_binary_alignment_loss = False

         self.length_scale = (
@@ -208,19 +216,19 @@ class FastPitch(BaseTTS):
             self.args.duration_predictor_dropout_p,
         )

-        self.pitch_predictor = DurationPredictor(
-            self.args.hidden_channels + self.args.d_vector_dim,
-            self.args.pitch_predictor_hidden_channels,
-            self.args.pitch_predictor_kernel_size,
-            self.args.pitch_predictor_dropout_p,
-        )
-        self.pitch_emb = nn.Conv1d(
-            1,
-            self.args.hidden_channels,
-            kernel_size=self.args.pitch_embedding_kernel_size,
-            padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
-        )
+        if self.args.use_pitch:
+            self.pitch_predictor = DurationPredictor(
+                self.args.hidden_channels + self.args.d_vector_dim,
+                self.args.pitch_predictor_hidden_channels,
+                self.args.pitch_predictor_kernel_size,
+                self.args.pitch_predictor_dropout_p,
+            )
+            self.pitch_emb = nn.Conv1d(
+                1,
+                self.args.hidden_channels,
+                kernel_size=self.args.pitch_embedding_kernel_size,
+                padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
+            )

         if self.args.num_speakers > 1 and not self.args.use_d_vector:
             # speaker embedding layer
@@ -257,18 +265,22 @@ class FastPitch(BaseTTS):
         """Generate attention alignment map from durations and
         expand encoder outputs

-        Shapes
+        Shapes:
             - en: :math:`(B, D_{en}, T_{en})`
             - dr: :math:`(B, T_{en})`
             - x_mask: :math:`(B, T_{en})`
             - y_mask: :math:`(B, T_{de})`

-        Examples:
-            - encoder output: :math:`[a,b,c,d]`
-            - durations: :math:`[1, 3, 2, 1]`
-
-            - expanded: :math:`[a, b, b, b, c, c, d]`
-            - attention map: :math:`[[0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0]]`
+        Examples::
+
+            encoder output: [a,b,c,d]
+            durations: [1, 3, 2, 1]
+
+            expanded: [a, b, b, b, c, c, d]
+            attention map: [[0, 0, 0, 0, 0, 0, 1],
+                            [0, 0, 0, 0, 1, 1, 0],
+                            [0, 1, 1, 1, 0, 0, 0],
+                            [1, 0, 0, 0, 0, 0, 0]]
         """
         attn = self.generate_attn(dr, x_mask, y_mask)
         o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2)
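# A toy check (not from the diff) of the expansion example in the docstring above:
# durations [1, 3, 2, 1] expand encoder steps [a, b, c, d] to [a, b, b, b, c, c, d].
# Assumes the TTS package at this commit is importable.
import torch
from TTS.tts.utils.helpers import generate_path

en = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])     # [B, D_en, T_en]; stands for a, b, c, d
dr = torch.tensor([[1, 3, 2, 1]])               # [B, T_en] durations
attn = generate_path(dr, torch.ones(1, 4, 7))   # [B, T_en, T_de] hard alignment
o_en_ex = torch.matmul(attn.transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
print(o_en_ex)                                  # tensor([[[1., 2., 2., 2., 3., 3., 4.]]])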
@@ -416,7 +428,7 @@ class FastPitch(BaseTTS):
         """
         o_pitch = self.pitch_predictor(o_en, x_mask)
         if pitch is not None:
-            avg_pitch = average_pitch(pitch, dr)
+            avg_pitch = average_over_durations(pitch, dr)
             o_pitch_emb = self.pitch_emb(avg_pitch)
             return o_pitch_emb, o_pitch, avg_pitch
         o_pitch_emb = self.pitch_emb(o_pitch)
@@ -471,7 +483,7 @@ class FastPitch(BaseTTS):
         y: torch.FloatTensor = None,
         dr: torch.IntTensor = None,
         pitch: torch.FloatTensor = None,
-        aux_input: Dict = {"d_vectors": 0, "speaker_ids": None},  # pylint: disable=unused-argument
+        aux_input: Dict = {"d_vectors": None, "speaker_ids": None},  # pylint: disable=unused-argument
     ) -> Dict:
         """Model's forward pass.

@@ -479,10 +491,10 @@ class FastPitch(BaseTTS):
             x (torch.LongTensor): Input character sequences.
             x_lengths (torch.LongTensor): Input sequence lengths.
             y_lengths (torch.LongTensor): Output sequnce lengths. Defaults to None.
-            y (torch.FloatTensor): Spectrogram frames. Defaults to None.
-            dr (torch.IntTensor): Character durations over the spectrogram frames. Defaults to None.
-            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Defaults to None.
-            aux_input (Dict): Auxiliary model inputs. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.
+            y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
+            dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None.
+            pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
+            aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.

         Shapes:
             - x: :math:`[B, T_max]`
@@ -495,8 +507,8 @@ class FastPitch(BaseTTS):
         """
         g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
         # compute sequence masks
-        y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(y.dtype)
-        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(y.dtype)
+        y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
         # encoder pass
         o_en, o_en_dp, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
         # duration predictor pass
@@ -507,27 +519,36 @@ class FastPitch(BaseTTS):
         o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
         # generate attn mask from predicted durations
         o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
-        # aligner pass
+        # aligner
+        o_alignment_dur = None
+        alignment_soft = None
+        alignment_logprob = None
+        alignment_mas = None
         if self.use_aligner:
             o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas = self._forward_aligner(
                 x_emb, y, x_mask, y_mask
             )
+            alignment_soft = alignment_soft.transpose(1, 2)
+            alignment_mas = alignment_mas.transpose(1, 2)
             dr = o_alignment_dur
         # pitch predictor pass
-        o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en_dp, x_mask, pitch, dr)
-        o_en = o_en + o_pitch_emb
+        o_pitch = None
+        avg_pitch = None
+        if self.args.use_pitch:
+            o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en_dp, x_mask, pitch, dr)
+            o_en = o_en + o_pitch_emb
         # decoder pass
         o_de, attn = self._forward_decoder(o_en, dr, x_mask, y_lengths, g=g)
         outputs = {
-            "model_outputs": o_de,
-            "durations_log": o_dr_log.squeeze(1),
-            "durations": o_dr.squeeze(1),
-            "attn_durations": o_attn,  # for visualization
+            "model_outputs": o_de,  # [B, T, C]
+            "durations_log": o_dr_log.squeeze(1),  # [B, T]
+            "durations": o_dr.squeeze(1),  # [B, T]
+            "attn_durations": o_attn,  # for visualization [B, T_en, T_de']
             "pitch_avg": o_pitch,
             "pitch_avg_gt": avg_pitch,
-            "alignments": attn,
-            "alignment_soft": alignment_soft.transpose(1, 2),
-            "alignment_mas": alignment_mas.transpose(1, 2),
+            "alignments": attn,  # [B, T_de, T_en]
+            "alignment_soft": alignment_soft,
+            "alignment_mas": alignment_mas,
             "o_alignment_dur": o_alignment_dur,
             "alignment_logprob": alignment_logprob,
             "x_mask": x_mask,
@@ -558,8 +579,10 @@ class FastPitch(BaseTTS):
         o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
         y_lengths = o_dr.sum(1)
         # pitch predictor pass
-        o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en_dp, x_mask)
-        o_en = o_en + o_pitch_emb
+        o_pitch = None
+        if self.args.use_pitch:
+            o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en_dp, x_mask)
+            o_en = o_en + o_pitch_emb
         # decoder pass
         o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=g)
         outputs = {
@@ -575,7 +598,7 @@ class FastPitch(BaseTTS):
         text_lengths = batch["text_lengths"]
         mel_input = batch["mel_input"]
         mel_lengths = batch["mel_lengths"]
-        pitch = batch["pitch"]
+        pitch = batch["pitch"] if self.args.use_pitch else None
         d_vectors = batch["d_vectors"]
         speaker_ids = batch["speaker_ids"]
         durations = batch["durations"]
@@ -597,10 +620,10 @@ class FastPitch(BaseTTS):
             decoder_output_lens=mel_lengths,
             dur_output=outputs["durations_log"],
             dur_target=durations,
-            pitch_output=outputs["pitch_avg"],
-            pitch_target=outputs["pitch_avg_gt"],
+            pitch_output=outputs["pitch_avg"] if self.use_pitch else None,
+            pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None,
             input_lens=text_lengths,
-            alignment_logprob=outputs["alignment_logprob"],
+            alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None,
             alignment_soft=outputs["alignment_soft"] if self.use_binary_alignment_loss else None,
             alignment_hard=outputs["alignment_mas"] if self.use_binary_alignment_loss else None,
         )
@@ -615,28 +638,33 @@ class FastPitch(BaseTTS):
         model_outputs = outputs["model_outputs"]
         alignments = outputs["alignments"]
         mel_input = batch["mel_input"]
-        pitch = batch["pitch"]
-        pitch_avg_expanded, _ = self.expand_encoder_outputs(
-            outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"]
-        )

         pred_spec = model_outputs[0].data.cpu().numpy()
         gt_spec = mel_input[0].data.cpu().numpy()
         align_img = alignments[0].data.cpu().numpy()
-        pitch = pitch[0, 0].data.cpu().numpy()
-
-        # TODO: denormalize before plotting
-        pitch = abs(pitch)
-        pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy()

         figures = {
             "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
             "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
             "alignment": plot_alignment(align_img, output_fig=False),
-            "pitch_ground_truth": plot_pitch(pitch, gt_spec, ap, output_fig=False),
-            "pitch_avg_predicted": plot_pitch(pitch_avg_expanded, pred_spec, ap, output_fig=False),
         }

+        # plot pitch figures
+        if self.args.use_pitch:
+            pitch = batch["pitch"]
+            pitch_avg_expanded, _ = self.expand_encoder_outputs(
+                outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"]
+            )
+            pitch = pitch[0, 0].data.cpu().numpy()
+            # TODO: denormalize before plotting
+            pitch = abs(pitch)
+            pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy()
+            pitch_figures = {
+                "pitch_ground_truth": plot_pitch(pitch, gt_spec, ap, output_fig=False),
+                "pitch_avg_predicted": plot_pitch(pitch_avg_expanded, pred_spec, ap, output_fig=False),
+            }
+            figures.update(pitch_figures)
+
         # plot the attention mask computed from the predicted durations
         if "attn_durations" in outputs:
             alignments_hat = outputs["attn_durations"][0].data.cpu().numpy()
@@ -662,36 +690,11 @@ class FastPitch(BaseTTS):
         assert not self.training

     def get_criterion(self):
-        from TTS.tts.layers.losses import FastPitchLoss  # pylint: disable=import-outside-toplevel
+        from TTS.tts.layers.losses import ForwardTTSLoss  # pylint: disable=import-outside-toplevel

-        return FastPitchLoss(self.config)
+        return ForwardTTSLoss(self.config)

     def on_train_step_start(self, trainer):
         """Enable binary alignment loss when needed"""
         if trainer.total_steps_done > self.config.binary_align_loss_start_step:
             self.use_binary_alignment_loss = True
-
-
-def average_pitch(pitch, durs):
-    """Compute the average pitch value for each input character based on the durations.
-
-    Shapes:
-        - pitch: :math:`[B, 1, T_de]`
-        - durs: :math:`[B, T_en]`
-    """
-
-    durs_cums_ends = torch.cumsum(durs, dim=1).long()
-    durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
-    pitch_nonzero_cums = torch.nn.functional.pad(torch.cumsum(pitch != 0.0, dim=2), (1, 0))
-    pitch_cums = torch.nn.functional.pad(torch.cumsum(pitch, dim=2), (1, 0))
-
-    bs, l = durs_cums_ends.size()
-    n_formants = pitch.size(1)
-    dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
-    dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
-
-    pitch_sums = (torch.gather(pitch_cums, 2, dce) - torch.gather(pitch_cums, 2, dcs)).float()
-    pitch_nelems = (torch.gather(pitch_nonzero_cums, 2, dce) - torch.gather(pitch_nonzero_cums, 2, dcs)).float()
-
-    pitch_avg = torch.where(pitch_nelems == 0.0, pitch_nelems, pitch_sums / pitch_nelems)
-    return pitch_avg
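# Illustrative example (not from the diff) of `average_over_durations`, the shared helper
# that replaces the module-level `average_pitch` removed above. Values are made up.
import torch
from TTS.tts.utils.helpers import average_over_durations

pitch = torch.tensor([[[100.0, 110.0, 120.0, 0.0, 90.0, 95.0, 80.0]]])  # [B, 1, T_de]
durs = torch.tensor([[1, 3, 2, 1]])                                      # [B, T_en]
print(average_over_durations(pitch, durs))
# tensor([[[100.0000, 115.0000,  92.5000,  80.0000]]])  (zero/unvoiced frames are skipped)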
@ -7,9 +7,8 @@ from torch.nn import functional as F
|
||||||
from TTS.tts.configs import GlowTTSConfig
|
from TTS.tts.configs import GlowTTSConfig
|
||||||
from TTS.tts.layers.glow_tts.decoder import Decoder
|
from TTS.tts.layers.glow_tts.decoder import Decoder
|
||||||
from TTS.tts.layers.glow_tts.encoder import Encoder
|
from TTS.tts.layers.glow_tts.encoder import Encoder
|
||||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
|
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.data import sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||||
from TTS.tts.utils.speakers import get_speaker_manager
|
from TTS.tts.utils.speakers import get_speaker_manager
|
||||||
from TTS.tts.utils.synthesis import synthesis
|
from TTS.tts.utils.synthesis import synthesis
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
|
@ -133,7 +132,7 @@ class GlowTTS(BaseTTS):
|
||||||
return y_mean, y_log_scale, o_attn_dur
|
return y_mean, y_log_scale, o_attn_dur
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
|
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
|
||||||
): # pylint: disable=dangerous-default-value
|
): # pylint: disable=dangerous-default-value
|
||||||
"""
|
"""
|
||||||
Shapes:
|
Shapes:
|
||||||
|
@ -185,7 +184,7 @@ class GlowTTS(BaseTTS):
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def inference_with_MAS(
|
def inference_with_MAS(
|
||||||
self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
|
self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
|
||||||
): # pylint: disable=dangerous-default-value
|
): # pylint: disable=dangerous-default-value
|
||||||
"""
|
"""
|
||||||
It's similar to the teacher forcing in Tacotron.
|
It's similar to the teacher forcing in Tacotron.
|
||||||
|
@ -246,7 +245,7 @@ class GlowTTS(BaseTTS):
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def decoder_inference(
|
def decoder_inference(
|
||||||
self, y, y_lengths=None, aux_input={"d_vectors": None, 'speaker_ids':None}
|
self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
|
||||||
): # pylint: disable=dangerous-default-value
|
): # pylint: disable=dangerous-default-value
|
||||||
"""
|
"""
|
||||||
Shapes:
|
Shapes:
|
||||||
|
@ -278,7 +277,9 @@ class GlowTTS(BaseTTS):
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids":None}): # pylint: disable=dangerous-default-value
|
def inference(
|
||||||
|
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
|
||||||
|
): # pylint: disable=dangerous-default-value
|
||||||
x_lengths = aux_input["x_lengths"]
|
x_lengths = aux_input["x_lengths"]
|
||||||
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
||||||
|
|
||||||
|
@ -331,7 +332,13 @@ class GlowTTS(BaseTTS):
|
||||||
d_vectors = batch["d_vectors"]
|
d_vectors = batch["d_vectors"]
|
||||||
speaker_ids = batch["speaker_ids"]
|
speaker_ids = batch["speaker_ids"]
|
||||||
|
|
||||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids":speaker_ids})
|
outputs = self.forward(
|
||||||
|
text_input,
|
||||||
|
text_lengths,
|
||||||
|
mel_input,
|
||||||
|
mel_lengths,
|
||||||
|
aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
|
||||||
|
)
|
||||||
|
|
||||||
loss_dict = criterion(
|
loss_dict = criterion(
|
||||||
outputs["model_outputs"],
|
outputs["model_outputs"],
|
||||||
|
|
|
@ -1,320 +0,0 @@
|
||||||
from dataclasses import dataclass, field
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from coqpit import Coqpit
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
from TTS.tts.layers.feed_forward.decoder import Decoder
|
|
||||||
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
|
|
||||||
from TTS.tts.layers.feed_forward.encoder import Encoder
|
|
||||||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
|
||||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path
|
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
|
||||||
from TTS.tts.utils.data import sequence_mask
|
|
||||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.io import load_fsspec
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class SpeedySpeechArgs(Coqpit):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
num_chars (int): number of unique input to characters
|
|
||||||
out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size.
|
|
||||||
hidden_channels (int): number of channels in all the model layers.
|
|
||||||
positional_encoding (bool, optional): enable/disable Positional encoding on encoder outputs. Defaults to True.
|
|
||||||
length_scale (int, optional): coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.
|
|
||||||
encoder_type (str, optional): set the encoder type. Defaults to 'residual_conv_bn'.
|
|
||||||
encoder_params (dict, optional): set encoder parameters depending on 'encoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13 }.
|
|
||||||
decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'.
|
|
||||||
decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }.
|
|
||||||
num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0.
|
|
||||||
use_d_vector (bool, optional): enable external speaker embeddings. Defaults to False.
|
|
||||||
d_vector_dim (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
|
|
||||||
"""
|
|
||||||
|
|
||||||
num_chars: int = None
|
|
||||||
out_channels: int = 80
|
|
||||||
hidden_channels: int = 128
|
|
||||||
num_speakers: int = 0
|
|
||||||
positional_encoding: bool = True
|
|
||||||
length_scale: int = 1
|
|
||||||
encoder_type: str = "residual_conv_bn"
|
|
||||||
encoder_params: dict = field(
|
|
||||||
default_factory=lambda: {
|
|
||||||
"kernel_size": 4,
|
|
||||||
"dilations": 4 * [1, 2, 4] + [1],
|
|
||||||
"num_conv_blocks": 2,
|
|
||||||
"num_res_blocks": 13,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
decoder_type: str = "residual_conv_bn"
|
|
||||||
decoder_params: dict = field(
|
|
||||||
default_factory=lambda: {
|
|
||||||
"kernel_size": 4,
|
|
||||||
"dilations": 4 * [1, 2, 4, 8] + [1],
|
|
||||||
"num_conv_blocks": 2,
|
|
||||||
"num_res_blocks": 17,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
use_d_vector: bool = False
|
|
||||||
d_vector_dim: int = 0
|
|
||||||
|
|
||||||
|
|
||||||
class SpeedySpeech(BaseTTS):
|
|
||||||
"""Speedy Speech model
|
|
||||||
https://arxiv.org/abs/2008.03802
|
|
||||||
|
|
||||||
Encoder -> DurationPredictor -> Decoder
|
|
||||||
|
|
||||||
Paper abstract:
|
|
||||||
While recent neural sequence-to-sequence models have greatly improved the quality of speech
|
|
||||||
synthesis, there has not been a system capable of fast training, fast inference and high-quality audio synthesis
|
|
||||||
at the same time. We propose a student-teacher network capable of high-quality faster-than-real-time spectrogram
|
|
||||||
synthesis, with low requirements on computational resources and fast training time. We show that self-attention
|
|
||||||
layers are not necessary for generation of high quality audio. We utilize simple convolutional blocks with
|
|
||||||
residual connections in both student and teacher networks and use only a single attention layer in the teacher
|
|
||||||
model. Coupled with a MelGAN vocoder, our model's voice quality was rated significantly higher than Tacotron 2.
|
|
||||||
Our model can be efficiently trained on a single GPU and can run in real time even on a CPU. We provide both
|
|
||||||
our source code and audio samples in our GitHub repository.
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
The vanilla model is able to achieve a reasonable performance with only
|
|
||||||
~3M model parameters and convolutional layers.
|
|
||||||
|
|
||||||
This model requires precomputed phoneme durations to train a duration predictor. At inference
|
|
||||||
it only uses the duration predictor to compute durations and expand encoder outputs respectively.
|
|
||||||
|
|
||||||
You can also mix and match different encoder and decoder networks beyond the paper.
|
|
||||||
|
|
||||||
Check `SpeedySpeechArgs` for arguments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=dangerous-default-value
|
|
||||||
|
|
||||||
def __init__(self, config: Coqpit):
|
|
||||||
super().__init__()
|
|
||||||
self.config = config
|
|
||||||
|
|
||||||
if "characters" in config:
|
|
||||||
_, self.config, self.num_chars = self.get_characters(config)
|
|
||||||
|
|
||||||
self.length_scale = (
|
|
||||||
float(config.model_args.length_scale)
|
|
||||||
if isinstance(config.model_args.length_scale, int)
|
|
||||||
else config.model_args.length_scale
|
|
||||||
)
|
|
||||||
self.emb = nn.Embedding(self.num_chars, config.model_args.hidden_channels)
|
|
||||||
self.encoder = Encoder(
|
|
||||||
config.model_args.hidden_channels,
|
|
||||||
config.model_args.hidden_channels,
|
|
||||||
config.model_args.encoder_type,
|
|
||||||
config.model_args.encoder_params,
|
|
||||||
config.model_args.d_vector_dim,
|
|
||||||
)
|
|
||||||
if config.model_args.positional_encoding:
|
|
||||||
self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels)
|
|
||||||
self.decoder = Decoder(
|
|
||||||
config.model_args.out_channels,
|
|
||||||
config.model_args.hidden_channels,
|
|
||||||
config.model_args.decoder_type,
|
|
||||||
config.model_args.decoder_params,
|
|
||||||
)
|
|
||||||
self.duration_predictor = DurationPredictor(config.model_args.hidden_channels + config.model_args.d_vector_dim)
|
|
||||||
|
|
||||||
if config.model_args.num_speakers > 1 and not config.model_args.use_d_vector:
|
|
||||||
# speaker embedding layer
|
|
||||||
self.emb_g = nn.Embedding(config.model_args.num_speakers, config.model_args.d_vector_dim)
|
|
||||||
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
|
||||||
|
|
||||||
if config.model_args.d_vector_dim > 0 and config.model_args.d_vector_dim != config.model_args.hidden_channels:
|
|
||||||
self.proj_g = nn.Conv1d(config.model_args.d_vector_dim, config.model_args.hidden_channels, 1)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def expand_encoder_outputs(en, dr, x_mask, y_mask):
|
|
||||||
"""Generate attention alignment map from durations and
|
|
||||||
expand encoder outputs
|
|
||||||
|
|
||||||
Example:
|
|
||||||
encoder output: [a,b,c,d]
|
|
||||||
durations: [1, 3, 2, 1]
|
|
||||||
|
|
||||||
expanded: [a, b, b, b, c, c, d]
|
|
||||||
attention map: [[0, 0, 0, 0, 0, 0, 1],
|
|
||||||
[0, 0, 0, 0, 1, 1, 0],
|
|
||||||
[0, 1, 1, 1, 0, 0, 0],
|
|
||||||
[1, 0, 0, 0, 0, 0, 0]]
|
|
||||||
"""
|
|
||||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
|
||||||
attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype)
|
|
||||||
o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
|
|
||||||
return o_en_ex, attn
|
|
||||||
|
|
||||||
def format_durations(self, o_dr_log, x_mask):
|
|
||||||
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
|
|
||||||
o_dr[o_dr < 1] = 1.0
|
|
||||||
o_dr = torch.round(o_dr)
|
|
||||||
return o_dr
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _concat_speaker_embedding(o_en, g):
|
|
||||||
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
|
|
||||||
o_en = torch.cat([o_en, g_exp], 1)
|
|
||||||
return o_en
|
|
||||||
|
|
||||||
def _sum_speaker_embedding(self, x, g):
|
|
||||||
# project g to decoder dim.
|
|
||||||
if hasattr(self, "proj_g"):
|
|
||||||
g = self.proj_g(g)
|
|
||||||
return x + g
|
|
||||||
|
|
||||||
def _forward_encoder(self, x, x_lengths, g=None):
|
|
||||||
if hasattr(self, "emb_g"):
|
|
||||||
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
|
|
||||||
|
|
||||||
if g is not None:
|
|
||||||
g = g.unsqueeze(-1)
|
|
||||||
|
|
||||||
# [B, T, C]
|
|
||||||
x_emb = self.emb(x)
|
|
||||||
# [B, C, T]
|
|
||||||
x_emb = torch.transpose(x_emb, 1, -1)
|
|
||||||
|
|
||||||
# compute sequence masks
|
|
||||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype)
|
|
||||||
|
|
||||||
# encoder pass
|
|
||||||
o_en = self.encoder(x_emb, x_mask)
|
|
||||||
|
|
||||||
# speaker conditioning for duration predictor
|
|
||||||
if g is not None:
|
|
||||||
o_en_dp = self._concat_speaker_embedding(o_en, g)
|
|
||||||
else:
|
|
||||||
o_en_dp = o_en
|
|
||||||
return o_en, o_en_dp, x_mask, g
|
|
||||||
|
|
||||||
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
|
|
||||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
|
|
||||||
# expand o_en with durations
|
|
||||||
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
|
|
||||||
# positional encoding
|
|
||||||
if hasattr(self, "pos_encoder"):
|
|
||||||
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
|
|
||||||
# speaker embedding
|
|
||||||
if g is not None:
|
|
||||||
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
|
|
||||||
# decoder pass
|
|
||||||
o_de = self.decoder(o_en_ex, y_mask, g=g)
|
|
||||||
return o_de, attn.transpose(1, 2)
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self, x, x_lengths, y_lengths, dr, aux_input={"d_vectors": None, "speaker_ids": None}
|
|
||||||
): # pylint: disable=unused-argument
|
|
||||||
"""
|
|
||||||
TODO: speaker embedding for speaker_ids
|
|
||||||
Shapes:
|
|
||||||
x: [B, T_max]
|
|
||||||
x_lengths: [B]
|
|
||||||
y_lengths: [B]
|
|
||||||
dr: [B, T_max]
|
|
||||||
g: [B, C]
|
|
||||||
"""
|
|
||||||
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
|
|
||||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
|
||||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
|
||||||
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
|
||||||
outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn}
|
|
||||||
return outputs
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument
|
|
||||||
"""
|
|
||||||
Shapes:
|
|
||||||
x: [B, T_max]
|
|
||||||
x_lengths: [B]
|
|
||||||
g: [B, C]
|
|
||||||
"""
|
|
||||||
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
|
|
||||||
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
|
|
||||||
# input sequence should be greated than the max convolution size
|
|
||||||
inference_padding = 5
|
|
||||||
if x.shape[1] < 13:
|
|
||||||
inference_padding += 13 - x.shape[1]
|
|
||||||
# pad input to prevent dropping the last word
|
|
||||||
x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0)
|
|
||||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
|
||||||
# duration predictor pass
|
|
||||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
|
||||||
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
|
|
||||||
y_lengths = o_dr.sum(1)
|
|
||||||
o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
|
|
||||||
outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn, "durations_log": None}
|
|
||||||
return outputs
|
|
||||||
|
|
||||||
def train_step(self, batch: dict, criterion: nn.Module):
|
|
||||||
text_input = batch["text_input"]
|
|
||||||
text_lengths = batch["text_lengths"]
|
|
||||||
mel_input = batch["mel_input"]
|
|
||||||
mel_lengths = batch["mel_lengths"]
|
|
||||||
d_vectors = batch["d_vectors"]
|
|
||||||
speaker_ids = batch["speaker_ids"]
|
|
||||||
durations = batch["durations"]
|
|
||||||
|
|
||||||
aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
|
|
||||||
outputs = self.forward(text_input, text_lengths, mel_lengths, durations, aux_input)
|
|
||||||
|
|
||||||
# compute loss
|
|
||||||
loss_dict = criterion(
|
|
||||||
outputs["model_outputs"],
|
|
||||||
mel_input,
|
|
||||||
mel_lengths,
|
|
||||||
outputs["durations_log"],
|
|
||||||
torch.log(1 + durations),
|
|
||||||
text_lengths,
|
|
||||||
)
|
|
||||||
|
|
||||||
# compute alignment error (the lower the better )
|
|
||||||
align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True)
|
|
||||||
loss_dict["align_error"] = align_error
|
|
||||||
return outputs, loss_dict
|
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use
|
|
||||||
model_outputs = outputs["model_outputs"]
|
|
||||||
alignments = outputs["alignments"]
|
|
||||||
mel_input = batch["mel_input"]
|
|
||||||
|
|
||||||
pred_spec = model_outputs[0].data.cpu().numpy()
|
|
||||||
gt_spec = mel_input[0].data.cpu().numpy()
|
|
||||||
align_img = alignments[0].data.cpu().numpy()
|
|
||||||
|
|
||||||
figures = {
|
|
||||||
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
|
|
||||||
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
|
|
||||||
"alignment": plot_alignment(align_img, output_fig=False),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Sample audio
|
|
||||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
|
||||||
return figures, {"audio": train_audio}
|
|
||||||
|
|
||||||
def eval_step(self, batch: dict, criterion: nn.Module):
|
|
||||||
return self.train_step(batch, criterion)
|
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
|
|
||||||
return self.train_log(ap, batch, outputs)
|
|
||||||
|
|
||||||
def load_checkpoint(
|
|
||||||
self, config, checkpoint_path, eval=False
|
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
|
||||||
self.load_state_dict(state["model"])
|
|
||||||
if eval:
|
|
||||||
self.eval()
|
|
||||||
assert not self.training
|
|
||||||
|
|
||||||
def get_criterion(self):
|
|
||||||
from TTS.tts.layers.losses import SpeedySpeechLoss # pylint: disable=import-outside-toplevel
|
|
||||||
|
|
||||||
return SpeedySpeechLoss(self.config)
|
|
|
@ -9,12 +9,11 @@ from torch import nn
|
||||||
from torch.cuda.amp.autocast_mode import autocast
|
from torch.cuda.amp.autocast_mode import autocast
|
||||||
|
|
||||||
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
||||||
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
|
|
||||||
from TTS.tts.layers.vits.discriminator import VitsDiscriminator
|
from TTS.tts.layers.vits.discriminator import VitsDiscriminator
|
||||||
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
|
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
|
||||||
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.data import sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
|
||||||
from TTS.tts.utils.speakers import get_speaker_manager
|
from TTS.tts.utils.speakers import get_speaker_manager
|
||||||
from TTS.tts.utils.synthesis import synthesis
|
from TTS.tts.utils.synthesis import synthesis
|
||||||
from TTS.tts.utils.visual import plot_alignment
|
from TTS.tts.utils.visual import plot_alignment
|
||||||
|
@ -24,28 +23,6 @@ from TTS.vocoder.models.hifigan_generator import HifiganGenerator
|
||||||
from TTS.vocoder.utils.generic_utils import plot_results
|
from TTS.vocoder.utils.generic_utils import plot_results
|
||||||
|
|
||||||
|
|
||||||
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
|
|
||||||
"""Segment each sample in a batch based on the provided segment indices"""
|
|
||||||
segments = torch.zeros_like(x[:, :, :segment_size])
|
|
||||||
for i in range(x.size(0)):
|
|
||||||
index_start = segment_indices[i]
|
|
||||||
index_end = index_start + segment_size
|
|
||||||
segments[i] = x[i, :, index_start:index_end]
|
|
||||||
return segments
|
|
||||||
|
|
||||||
|
|
||||||
def rand_segment(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
|
|
||||||
"""Create random segments based on the input lengths."""
|
|
||||||
B, _, T = x.size()
|
|
||||||
if x_lengths is None:
|
|
||||||
x_lengths = T
|
|
||||||
max_idxs = x_lengths - segment_size + 1
|
|
||||||
assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
|
|
||||||
segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
|
|
||||||
ret = segment(x, segment_indices, segment_size)
|
|
||||||
return ret, segment_indices
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class VitsArgs(Coqpit):
|
class VitsArgs(Coqpit):
|
||||||
"""VITS model arguments.
|
"""VITS model arguments.
|
||||||
|
@ -451,7 +428,7 @@ class Vits(BaseTTS):
|
||||||
logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])
|
logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])
|
||||||
|
|
||||||
# select a random feature segment for the waveform decoder
|
# select a random feature segment for the waveform decoder
|
||||||
z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size)
|
z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size)
|
||||||
o = self.waveform_decoder(z_slice, g=g)
|
o = self.waveform_decoder(z_slice, g=g)
|
||||||
outputs.update(
|
outputs.update(
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
|
||||||
|
|
||||||
|
|
||||||
def _pad_data(x, length):
|
def _pad_data(x, length):
|
||||||
|
@ -52,35 +51,3 @@ def prepare_stop_target(inputs, out_steps):
|
||||||
|
|
||||||
def pad_per_step(inputs, pad_len):
|
def pad_per_step(inputs, pad_len):
|
||||||
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
|
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=attribute-defined-outside-init
|
|
||||||
class StandardScaler:
|
|
||||||
def set_stats(self, mean, scale):
|
|
||||||
self.mean_ = mean
|
|
||||||
self.scale_ = scale
|
|
||||||
|
|
||||||
def reset_stats(self):
|
|
||||||
delattr(self, "mean_")
|
|
||||||
delattr(self, "scale_")
|
|
||||||
|
|
||||||
def transform(self, X):
|
|
||||||
X = np.asarray(X)
|
|
||||||
X -= self.mean_
|
|
||||||
X /= self.scale_
|
|
||||||
return X
|
|
||||||
|
|
||||||
def inverse_transform(self, X):
|
|
||||||
X = np.asarray(X)
|
|
||||||
X *= self.scale_
|
|
||||||
X += self.mean_
|
|
||||||
return X
|
|
||||||
|
|
||||||
|
|
||||||
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
|
|
||||||
def sequence_mask(sequence_length, max_len=None):
|
|
||||||
if max_len is None:
|
|
||||||
max_len = sequence_length.data.max()
|
|
||||||
seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
|
|
||||||
# B x T_max
|
|
||||||
return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
|
|
||||||
|
|
|
@ -0,0 +1,213 @@
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
try:
|
||||||
|
from TTS.tts.utils.monotonic_align.core import maximum_path_c
|
||||||
|
|
||||||
|
CYTHON = True
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
CYTHON = False
|
||||||
|
|
||||||
|
|
||||||
|
class StandardScaler:
|
||||||
|
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
|
||||||
|
|
||||||
|
def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
|
||||||
|
self.mean_ = mean
|
||||||
|
self.scale_ = scale
|
||||||
|
|
||||||
|
def set_stats(self, mean, scale):
|
||||||
|
self.mean_ = mean
|
||||||
|
self.scale_ = scale
|
||||||
|
|
||||||
|
def reset_stats(self):
|
||||||
|
delattr(self, "mean_")
|
||||||
|
delattr(self, "scale_")
|
||||||
|
|
||||||
|
def transform(self, X):
|
||||||
|
X = np.asarray(X)
|
||||||
|
X -= self.mean_
|
||||||
|
X /= self.scale_
|
||||||
|
return X
|
||||||
|
|
||||||
|
def inverse_transform(self, X):
|
||||||
|
X = np.asarray(X)
|
||||||
|
X *= self.scale_
|
||||||
|
X += self.mean_
|
||||||
|
return X
|
||||||
|
|
||||||
|
|
||||||
|
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
|
||||||
|
def sequence_mask(sequence_length, max_len=None):
|
||||||
|
"""Create a sequence mask for filtering padding in a sequence tensor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sequence_length (torch.tensor): Sequence lengths.
|
||||||
|
max_len (int, Optional): Maximum sequence length. Defaults to None.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- mask: :math:`[B, T_max]`
|
||||||
|
"""
|
||||||
|
if max_len is None:
|
||||||
|
max_len = sequence_length.data.max()
|
||||||
|
seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
|
||||||
|
# B x T_max
|
||||||
|
mask = seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
|
||||||
|
return mask
|
||||||
|
|
||||||
|
|
||||||
|
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
|
||||||
|
"""Segment each sample in a batch based on the provided segment indices
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (torch.tensor): Input tensor.
|
||||||
|
segment_indices (torch.tensor): Segment indices.
|
||||||
|
segment_size (int): Expected output segment size.
|
||||||
|
"""
|
||||||
|
segments = torch.zeros_like(x[:, :, :segment_size])
|
||||||
|
for i in range(x.size(0)):
|
||||||
|
index_start = segment_indices[i]
|
||||||
|
index_end = index_start + segment_size
|
||||||
|
segments[i] = x[i, :, index_start:index_end]
|
||||||
|
return segments
|
||||||
|
|
||||||
|
|
||||||
|
def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
|
||||||
|
"""Create random segments based on the input lengths.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (torch.tensor): Input tensor.
|
||||||
|
x_lengths (torch.tensor): Input lengths.
|
||||||
|
segment_size (int): Expected output segment size.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- x: :math:`[B, C, T]`
|
||||||
|
- x_lengths: :math:`[B]`
|
||||||
|
"""
|
||||||
|
B, _, T = x.size()
|
||||||
|
if x_lengths is None:
|
||||||
|
x_lengths = T
|
||||||
|
max_idxs = x_lengths - segment_size + 1
|
||||||
|
assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
|
||||||
|
segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long()
|
||||||
|
ret = segment(x, segment_indices, segment_size)
|
||||||
|
return ret, segment_indices
|
||||||
|
|
||||||
|
|
||||||
|
def average_over_durations(values, durs):
|
||||||
|
"""Average values over durations.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
- values: :math:`[B, 1, T_de]`
|
||||||
|
- durs: :math:`[B, T_en]`
|
||||||
|
- avg: :math:`[B, 1, T_en]`
|
||||||
|
"""
|
||||||
|
durs_cums_ends = torch.cumsum(durs, dim=1).long()
|
||||||
|
durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
|
||||||
|
values_nonzero_cums = torch.nn.functional.pad(torch.cumsum(values != 0.0, dim=2), (1, 0))
|
||||||
|
values_cums = torch.nn.functional.pad(torch.cumsum(values, dim=2), (1, 0))
|
||||||
|
|
||||||
|
bs, l = durs_cums_ends.size()
|
||||||
|
n_formants = values.size(1)
|
||||||
|
dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
|
||||||
|
dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
|
||||||
|
|
||||||
|
values_sums = (torch.gather(values_cums, 2, dce) - torch.gather(values_cums, 2, dcs)).float()
|
||||||
|
values_nelems = (torch.gather(values_nonzero_cums, 2, dce) - torch.gather(values_nonzero_cums, 2, dcs)).float()
|
||||||
|
|
||||||
|
avg = torch.where(values_nelems == 0.0, values_nelems, values_sums / values_nelems)
|
||||||
|
return avg
|
||||||
|
|
||||||
|
|
||||||
|
def convert_pad_shape(pad_shape):
|
||||||
|
l = pad_shape[::-1]
|
||||||
|
pad_shape = [item for sublist in l for item in sublist]
|
||||||
|
return pad_shape
|
||||||
|
|
||||||
|
|
||||||
|
def generate_path(duration, mask):
|
||||||
|
"""
|
||||||
|
Shapes:
|
||||||
|
- duration: :math:`[B, T_en]`
|
||||||
|
- mask: :math:'[B, T_en, T_de]`
|
||||||
|
- path: :math:`[B, T_en, T_de]`
|
||||||
|
"""
|
||||||
|
device = duration.device
|
||||||
|
b, t_x, t_y = mask.shape
|
||||||
|
cum_duration = torch.cumsum(duration, 1)
|
||||||
|
path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
|
||||||
|
|
||||||
|
cum_duration_flat = cum_duration.view(b * t_x)
|
||||||
|
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
|
||||||
|
path = path.view(b, t_x, t_y)
|
||||||
|
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
|
||||||
|
path = path * mask
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def maximum_path(value, mask):
|
||||||
|
if CYTHON:
|
||||||
|
return maximum_path_cython(value, mask)
|
||||||
|
return maximum_path_numpy(value, mask)
|
||||||
|
|
||||||
|
|
||||||
|
def maximum_path_cython(value, mask):
|
||||||
|
"""Cython optimised version.
|
||||||
|
Shapes:
|
||||||
|
- value: :math:`[B, T_en, T_de]`
|
||||||
|
- mask: :math:`[B, T_en, T_de]`
|
||||||
|
"""
|
||||||
|
value = value * mask
|
||||||
|
device = value.device
|
||||||
|
dtype = value.dtype
|
||||||
|
value = value.data.cpu().numpy().astype(np.float32)
|
||||||
|
path = np.zeros_like(value).astype(np.int32)
|
||||||
|
mask = mask.data.cpu().numpy()
|
||||||
|
|
||||||
|
t_x_max = mask.sum(1)[:, 0].astype(np.int32)
|
||||||
|
t_y_max = mask.sum(2)[:, 0].astype(np.int32)
|
||||||
|
maximum_path_c(path, value, t_x_max, t_y_max)
|
||||||
|
return torch.from_numpy(path).to(device=device, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def maximum_path_numpy(value, mask, max_neg_val=None):
|
||||||
|
"""
|
||||||
|
Monotonic alignment search algorithm
|
||||||
|
Numpy-friendly version. It's about 4 times faster than torch version.
|
||||||
|
value: [b, t_x, t_y]
|
||||||
|
mask: [b, t_x, t_y]
|
||||||
|
"""
|
||||||
|
if max_neg_val is None:
|
||||||
|
max_neg_val = -np.inf # Patch for Sphinx complaint
|
||||||
|
value = value * mask
|
||||||
|
|
||||||
|
device = value.device
|
||||||
|
dtype = value.dtype
|
||||||
|
value = value.cpu().detach().numpy()
|
||||||
|
mask = mask.cpu().detach().numpy().astype(np.bool)
|
||||||
|
|
||||||
|
b, t_x, t_y = value.shape
|
||||||
|
direction = np.zeros(value.shape, dtype=np.int64)
|
||||||
|
v = np.zeros((b, t_x), dtype=np.float32)
|
||||||
|
x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
|
||||||
|
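# forward pass: for each spectrogram frame, each text position keeps the better of staying on the same position (v1) or advancing from the previous one (v0)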
for j in range(t_y):
|
||||||
|
v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
|
||||||
|
v1 = v
|
||||||
|
max_mask = v1 >= v0
|
||||||
|
v_max = np.where(max_mask, v1, v0)
|
||||||
|
direction[:, :, j] = max_mask
|
||||||
|
|
||||||
|
index_mask = x_range <= j
|
||||||
|
v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
|
||||||
|
direction = np.where(mask, direction, 1)
|
||||||
|
|
||||||
|
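# backtracking pass: walk back through the stored directions to mark the best-scoring path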
path = np.zeros(value.shape, dtype=np.float32)
|
||||||
|
index = mask[:, :, 0].sum(1).astype(np.int64) - 1
|
||||||
|
index_range = np.arange(b)
|
||||||
|
for j in reversed(range(t_y)):
|
||||||
|
path[index_range, index, j] = 1
|
||||||
|
index = index + direction[index_range, index, j] - 1
|
||||||
|
path = path * mask.astype(np.float32)
|
||||||
|
path = torch.from_numpy(path).to(device=device, dtype=dtype)
|
||||||
|
return path
|
File diff suppressed because it is too large
|
@ -101,6 +101,7 @@ def visualize(
|
||||||
figsize=(8, 24),
|
figsize=(8, 24),
|
||||||
output_fig=False,
|
output_fig=False,
|
||||||
):
|
):
|
||||||
|
"""Intended to be used in Notebooks."""
|
||||||
|
|
||||||
if decoder_output is not None:
|
if decoder_output is not None:
|
||||||
num_plot = 4
|
num_plot = 4
|
||||||
|
|
|
@ -9,7 +9,7 @@ import soundfile as sf
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from TTS.tts.utils.data import StandardScaler
|
from TTS.tts.utils.helpers import StandardScaler
|
||||||
|
|
||||||
|
|
||||||
class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
||||||
|
@ -608,6 +608,9 @@ class AudioProcessor(object):
|
||||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||||
S_complex = np.abs(S).astype(np.complex)
|
S_complex = np.abs(S).astype(np.complex)
|
||||||
y = self._istft(S_complex * angles)
|
y = self._istft(S_complex * angles)
|
||||||
|
if not np.isfinite(y).all():
|
||||||
|
print(" [!] Waveform is not finite everywhere. Skipping the GL.")
|
||||||
|
return np.array([0.0])
|
||||||
for _ in range(self.griffin_lim_iters):
|
for _ in range(self.griffin_lim_iters):
|
||||||
angles = np.exp(1j * np.angle(self._stft(y)))
|
angles = np.exp(1j * np.angle(self._stft(y)))
|
||||||
y = self._istft(S_complex * angles)
|
y = self._istft(S_complex * angles)
|
||||||
|
|
|
@ -59,7 +59,7 @@ def load_wav_feat_data(data_path, feat_path, eval_split_size):
|
||||||
wav_paths.sort(key=lambda x: Path(x).stem)
|
wav_paths.sort(key=lambda x: Path(x).stem)
|
||||||
feat_paths.sort(key=lambda x: Path(x).stem)
|
feat_paths.sort(key=lambda x: Path(x).stem)
|
||||||
|
|
||||||
assert len(wav_paths) == len(feat_paths)
|
assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {feat_paths}"
|
||||||
for wav, feat in zip(wav_paths, feat_paths):
|
for wav, feat in zip(wav_paths, feat_paths):
|
||||||
wav_name = Path(wav).stem
|
wav_name = Path(wav).stem
|
||||||
feat_name = Path(feat).stem
|
feat_name = Path(feat).stem
|
||||||
|
|
|
@ -7,7 +7,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
|
||||||
- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny.
|
- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny.
|
||||||
|
|
||||||
## What are the requirements of a good 🐸TTS dataset?
|
## What are the requirements of a good 🐸TTS dataset?
|
||||||
* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset
|
* {ref}`See this page <what_makes_a_good_dataset>`
|
||||||
|
|
||||||
## How should I choose the right model?
|
## How should I choose the right model?
|
||||||
- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2.
|
- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2.
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
# Fine-tuning a 🐸 TTS model
|
||||||
|
|
||||||
|
## Fine-tuning
|
||||||
|
|
||||||
|
Fine-tuning takes a pre-trained model and retrains it to improve its performance on a different task or dataset.
|
||||||
|
In 🐸TTS we provide pre-trained models in different languages, each with different pros and cons. You can take one of
|
||||||
|
them and fine-tune it for your own dataset. This will help you in two main ways:
|
||||||
|
|
||||||
|
1. Faster learning
|
||||||
|
|
||||||
|
Since a pre-trained model has already learned features that are relevant for the task, it will converge faster on
|
||||||
|
a new dataset. This will reduce the cost of training and let you experiment faster.
|
||||||
|
|
||||||
|
2. Better results with small datasets
|
||||||
|
|
||||||
|
Deep learning models are data hungry and they give better performance with more data. However, it is not always
|
||||||
|
possible to have this abundance, especially in specific domains. For instance, the LJSpeech dataset, which we released most of
|
||||||
|
our English models with, is almost 24 hours long, and collecting this amount of data with
|
||||||
|
the help of a voice talent takes weeks.
|
||||||
|
|
||||||
|
Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models and fine-tune it on your own
|
||||||
|
speech dataset and achieve reasonable results with only a couple of hours of data in the worst case.
|
||||||
|
|
||||||
|
However, note that fine-tuning does not guarantee great results. The model performance still depends on the
|
||||||
|
{ref}`dataset quality <what_makes_a_good_dataset>` and the hyper-parameters you choose for fine-tuning. Therefore,
|
||||||
|
it still demands a bit of tinkering.
|
||||||
|
|
||||||
|
|
||||||
|
## Steps to fine-tune a 🐸 TTS model
|
||||||
|
|
||||||
|
1. Set up your dataset.
|
||||||
|
|
||||||
|
You need to format your target dataset in a certain way so that the 🐸TTS data loader can load it for
|
||||||
|
training. Please see {ref}`this page <formatting_your_dataset>` for more information about formatting.
|
||||||
|
|
||||||
|
2. Choose the model you want to fine-tune.
|
||||||
|
|
||||||
|
You can list the available models on the terminal as follows.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tts --list-models
|
||||||
|
```
|
||||||
|
|
||||||
|
The command above lists the models in the naming format ```<model_type>/<language>/<dataset>/<model_name>```.
|
||||||
|
|
||||||
|
Or you can manually check the `.models.json` file in the project directory.
|
||||||
|
|
||||||
|
You should choose the model based on your requirements. Some models are fast and some are better in speech quality.
|
||||||
|
A quick way to check a model is to run it on the hardware you want to use and see how it performs. For
|
||||||
|
simple testing, you can use the `tts` command on the terminal. For more info see {ref}`here <synthesizing_speech>`.
|
||||||
|
|
||||||
|
3. Download the model.
|
||||||
|
|
||||||
|
You can download the model with the `tts` command. If you run `tts` with a particular model, it is downloaded automatically
|
||||||
|
and the model path will be printed on the terminal.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tts --model_name tts_models/es/mai/tacotron2-DDC --text "Ola."
|
||||||
|
|
||||||
|
> Downloading model to /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
In the example above, we call the Spanish Tacotron model, and the sample output shows the path where
|
||||||
|
the model is downloaded.
|
||||||
|
|
||||||
|
4. Set up the model config for fine-tuning.
|
||||||
|
|
||||||
|
You need to change certain fields in the model config. You have three options for editing the configuration.
|
||||||
|
|
||||||
|
1. Edit the fields in the ```config.json``` file if you want to use ```TTS/bin/train_tts.py``` to train the model.
|
||||||
|
2. Edit the fields in one of the training scripts in the ```recipes``` directory if you want to use Python.
|
||||||
|
3. Use the command-line arguments to override the fields like ```--coqpit.lr 0.00001``` to change the learning rate.
|
||||||
|
|
||||||
|
Some of the important fields are as follows:
|
||||||
|
|
||||||
|
- `datasets` field: This is set to the dataset you want to fine-tune the model on.
|
||||||
|
- `run_name` field: This is the name of the run. This is used to name the output directory and the entry in the
|
||||||
|
logging dashboard.
|
||||||
|
- `output_path` field: This is the path where the fine-tuned model is saved.
|
||||||
|
- `lr` field: You may need to use a smaller learning rate for fine-tuning so as not to impair the features learned by the
|
||||||
|
pre-trained model with big update steps.
|
||||||
|
- `audio` fields: Different datasets have different audio characteristics. You must check the current audio parameters and
|
||||||
|
make sure that the values reflect your dataset. For instance, your dataset might have a different audio sampling rate.
|
||||||
|
|
||||||
|
Apart from the fields above, you should check the whole configuration file and make sure that the values are correct for
|
||||||
|
your dataset and training. A minimal Python sketch of these edits is shown below.
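For instance, in a recipe-style Python script (option 2 above), those fields could be set roughly as follows. This is only a sketch with illustrative values: `SpeedySpeechConfig` stands in for whichever config class matches the model you are fine-tuning, and `MyTTSDataset` is a placeholder path.

```python
import os

from TTS.config import BaseDatasetConfig
from TTS.tts.configs import SpeedySpeechConfig

output_path = os.path.dirname(os.path.abspath(__file__))

# `datasets` field: point the data loader at your own dataset (placeholder path).
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "../MyTTSDataset/"),
)

config = SpeedySpeechConfig(
    run_name="speedy_speech_finetune",  # `run_name` field: names the output folder and the dashboard entry
    output_path=output_path,            # `output_path` field: where the fine-tuned checkpoints are saved
    lr=1e-5,                            # `lr` field: smaller than when training from scratch
    datasets=[dataset_config],          # `datasets` field
)

# `audio` fields: make them match your dataset, e.g. the sampling rate.
config.audio.sample_rate = 22050
```

You would then launch the script with the ```--restore_path``` flag described in the next step.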
|
||||||
|
|
||||||
|
5. Start fine-tuning.
|
||||||
|
|
||||||
|
Whether you use one of the training scripts under the ```recipes``` folder or ```train_tts.py``` to start
|
||||||
|
your training, you should use the ```--restore_path``` flag to specify the path to the pre-trained model.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||||
|
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
|
||||||
|
--config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
|
||||||
|
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts
|
||||||
|
```
|
||||||
|
|
||||||
|
As stated above, you can also use command-line arguments to change the model configuration.
|
||||||
|
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||||
|
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts \
|
||||||
|
--coqpit.run_name "glow-tts-finetune" \
|
||||||
|
--coqpit.lr 0.00001
|
||||||
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
(formatting_your_dataset)=
|
||||||
# Formatting Your Dataset
|
# Formatting Your Dataset
|
||||||
|
|
||||||
For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription.
|
For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription.
|
||||||
|
@ -18,15 +19,15 @@ Let's assume you created the audio clips and their transcription. You can collec
|
||||||
|
|
||||||
You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimited by a special character separating the audio file name from the transcription. Make sure that the delimiter is not used in the transcription text.
|
You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimited by a special character separating the audio file name from the transcription. Make sure that the delimiter is not used in the transcription text.
|
||||||
|
|
||||||
We recommend the following format delimited by `|`.
|
We recommend the following format delimited by `||`.
|
||||||
|
|
||||||
```
|
```
|
||||||
# metadata.txt
|
# metadata.txt
|
||||||
|
|
||||||
audio1.wav | This is my sentence.
|
audio1.wav || This is my sentence.
|
||||||
audio2.wav | This is maybe my sentence.
|
audio2.wav || This is maybe my sentence.
|
||||||
audio3.wav | This is certainly my sentence.
|
audio3.wav || This is certainly my sentence.
|
||||||
audio4.wav | Let this be your sentence.
|
audio4.wav || Let this be your sentence.
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
inference
|
inference
|
||||||
implementing_a_new_model
|
implementing_a_new_model
|
||||||
training_a_model
|
training_a_model
|
||||||
|
finetuning
|
||||||
configuration
|
configuration
|
||||||
formatting_your_dataset
|
formatting_your_dataset
|
||||||
what_makes_a_good_dataset
|
what_makes_a_good_dataset
|
||||||
|
@ -45,7 +46,7 @@
|
||||||
|
|
||||||
models/glow_tts.md
|
models/glow_tts.md
|
||||||
models/vits.md
|
models/vits.md
|
||||||
models/fast_pitch.md
|
models/forward_tts.md
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
# Forward TTS model(s)
|
||||||
|
|
||||||
|
A general feed-forward TTS model implementation that can be configured to different architectures by setting different
|
||||||
|
encoder and decoder networks. It can be trained with either pre-computed durations (from pre-trained Tacotron) or
|
||||||
|
an alignment network that learns the text to audio alignment from the input data.
|
||||||
|
|
||||||
|
Currently we provide the following pre-configured architectures (a minimal configuration sketch follows the list):
|
||||||
|
|
||||||
|
- **FastSpeech:**
|
||||||
|
|
||||||
|
It's a feed-forward TTS model that uses Feed-Forward Transformer (FFT) modules as the encoder and decoder.
|
||||||
|
|
||||||
|
- **FastPitch:**
|
||||||
|
|
||||||
|
It uses the same FastSpeech architecture but is conditioned on fundamental frequency (f0) contours, with the
|
||||||
|
promise of more expressive speech.
|
||||||
|
|
||||||
|
- **SpeedySpeech:**
|
||||||
|
|
||||||
|
It uses residual convolution layers instead of Transformers, which leads to a more compute-friendly model.
|
||||||
|
|
||||||
|
- **FastSpeech2 (TODO):**
|
||||||
|
|
||||||
|
Similar to FastPitch, but it also conditions on spectral energy values.
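All of the variants above are served by the same `ForwardTTS` implementation and differ mainly in their arguments. The snippet below is only a rough sketch with illustrative values (the variants also use different encoder and decoder types, which the config classes below set for you):

```python
from TTS.tts.models.forward_tts import ForwardTTS, ForwardTTSArgs

# FastPitch-like setup: pitch conditioning enabled, durations learned by the aligner network.
fast_pitch = ForwardTTS(ForwardTTSArgs(num_chars=100, use_pitch=True, use_aligner=True))

# FastSpeech / SpeedySpeech-like setup: no pitch predictor, durations pre-computed instead of learned.
speedy_speech = ForwardTTS(ForwardTTSArgs(num_chars=100, use_pitch=False, use_aligner=False))
```

In practice you would normally pick one of the config classes documented below (e.g. `FastPitchConfig`), which bundles these model arguments with the training settings.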
|
||||||
|
|
||||||
|
## Important resources & papers
|
||||||
|
- FastPitch: https://arxiv.org/abs/2006.06873
|
||||||
|
- SpeedySpeech: https://arxiv.org/abs/2008.03802
|
||||||
|
- FastSpeech: https://arxiv.org/pdf/1905.09263
|
||||||
|
- FastSpeech2: https://arxiv.org/abs/2006.04558
|
||||||
|
- Aligner Network: https://arxiv.org/abs/2108.10447
|
||||||
|
- What is Pitch: https://www.britannica.com/topic/pitch-speech
|
||||||
|
|
||||||
|
|
||||||
|
## ForwardTTSArgs
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.models.forward_tts.ForwardTTSArgs
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## ForwardTTS Model
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.models.forward_tts.ForwardTTS
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## FastPitchConfig
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.configs.fast_pitch_config.FastPitchConfig
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## SpeedySpeechConfig
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.configs.speedy_speech_config.SpeedySpeechConfig
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## FastSpeechConfig
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.configs.fast_speech_config.FastSpeechConfig
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
|
|
@ -54,7 +54,7 @@
|
||||||
|
|
||||||
4. Run the training.
|
4. Run the training.
|
||||||
|
|
||||||
You need to call the python training script.
|
You need to run the training script.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
|
$ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
|
||||||
|
@ -63,7 +63,7 @@
|
||||||
Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable.
|
Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable.
|
||||||
To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.
|
To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.
|
||||||
|
|
||||||
If you like to run a multi-gpu training
|
If you like to run a multi-gpu training using DDP back-end,
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
|
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
(what_makes_a_good_dataset)=
|
||||||
# What makes a good TTS dataset
|
# What makes a good TTS dataset
|
||||||
|
|
||||||
## What Makes a Good Dataset
|
## What Makes a Good Dataset
|
||||||
|
|
|
@ -2,16 +2,14 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
"source": [
|
||||||
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
|
"This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training."
|
||||||
]
|
],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"%load_ext autoreload\n",
|
"%load_ext autoreload\n",
|
||||||
"%autoreload 2\n",
|
"%autoreload 2\n",
|
||||||
|
@ -25,22 +23,23 @@
|
||||||
"from TTS.tts.datasets.TTSDataset import TTSDataset\n",
|
"from TTS.tts.datasets.TTSDataset import TTSDataset\n",
|
||||||
"from TTS.tts.layers.losses import L1LossMasked\n",
|
"from TTS.tts.layers.losses import L1LossMasked\n",
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
"from TTS.utils.audio import AudioProcessor\n",
|
||||||
"from TTS.utils.io import load_config\n",
|
"from TTS.config import load_config\n",
|
||||||
"from TTS.tts.utils.visual import plot_spectrogram\n",
|
"from TTS.tts.utils.visual import plot_spectrogram\n",
|
||||||
"from TTS.tts.utils.generic_utils import setup_model, sequence_mask\n",
|
"from TTS.tts.utils.helpers import sequence_mask\n",
|
||||||
|
"from TTS.tts.models import setup_model\n",
|
||||||
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||||
"\n",
|
"\n",
|
||||||
"%matplotlib inline\n",
|
"%matplotlib inline\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"os.environ['CUDA_VISIBLE_DEVICES']='0'"
|
"os.environ['CUDA_VISIBLE_DEVICES']='2'"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"def set_filename(wav_path, out_path):\n",
|
"def set_filename(wav_path, out_path):\n",
|
||||||
" wav_file = os.path.basename(wav_path)\n",
|
" wav_file = os.path.basename(wav_path)\n",
|
||||||
|
@ -52,20 +51,20 @@
|
||||||
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
||||||
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
||||||
" return file_name, wavq_path, mel_path, wav_path"
|
" return file_name, wavq_path, mel_path, wav_path"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"OUT_PATH = \"/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA\"\n",
|
"OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n",
|
||||||
"DATA_PATH = \"/home/erogol/gdrive/Datasets/non-binary-voice-files/\"\n",
|
"DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n",
|
||||||
"DATASET = \"sam_accenture\"\n",
|
"DATASET = \"ljspeech\"\n",
|
||||||
"METADATA_FILE = \"recording_script.xml\"\n",
|
"METADATA_FILE = \"metadata.csv\"\n",
|
||||||
"CONFIG_PATH = \"/home/erogol/gdrive/Trainings/sam/ljspeech-dcattn-April-03-2021_05+02-2344379/config.json\"\n",
|
"CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
|
||||||
"MODEL_FILE = \"/home/erogol/gdrive/Trainings/sam/ljspeech-dcattn-April-03-2021_05+02-2344379/best_model.pth.tar\"\n",
|
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
|
||||||
"BATCH_SIZE = 32\n",
|
"BATCH_SIZE = 32\n",
|
||||||
"\n",
|
"\n",
|
||||||
"QUANTIZED_WAV = False\n",
|
"QUANTIZED_WAV = False\n",
|
||||||
|
@ -78,56 +77,63 @@
|
||||||
"C = load_config(CONFIG_PATH)\n",
|
"C = load_config(CONFIG_PATH)\n",
|
||||||
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
|
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
|
||||||
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
|
"print(C['r'])\n",
|
||||||
"# if the vocabulary was passed, replace the default\n",
|
"# if the vocabulary was passed, replace the default\n",
|
||||||
"if 'characters' in C.keys():\n",
|
"if 'characters' in C and C['characters']:\n",
|
||||||
" symbols, phonemes = make_symbols(**C.characters)\n",
|
" symbols, phonemes = make_symbols(**C.characters)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# load the model\n",
|
"# load the model\n",
|
||||||
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
||||||
"# TODO: multiple speaker\n",
|
"# TODO: multiple speaker\n",
|
||||||
"model = setup_model(num_chars, num_speakers=0, c=C)\n",
|
"model = setup_model(C)\n",
|
||||||
"checkpoint = torch.load(MODEL_FILE)\n",
|
"model.load_checkpoint(C, MODEL_FILE, eval=True)"
|
||||||
"model.load_state_dict(checkpoint['model'])\n",
|
],
|
||||||
"print(checkpoint['step'])\n",
|
"outputs": [],
|
||||||
"model.eval()\n",
|
"metadata": {}
|
||||||
"model.decoder.set_r(checkpoint['r'])\n",
|
|
||||||
"if use_cuda:\n",
|
|
||||||
" model = model.cuda()"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
|
"preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
|
||||||
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
||||||
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
|
"meta_data = preprocessor(DATA_PATH, METADATA_FILE)\n",
|
||||||
"dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,characters=c.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
"dataset = TTSDataset(\n",
|
||||||
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
" checkpoint[\"config\"][\"r\"],\n",
|
||||||
]
|
" C.text_cleaner,\n",
|
||||||
|
" False,\n",
|
||||||
|
" ap,\n",
|
||||||
|
" meta_data,\n",
|
||||||
|
" characters=C.get('characters', None),\n",
|
||||||
|
" use_phonemes=C.use_phonemes,\n",
|
||||||
|
" phoneme_cache_path=C.phoneme_cache_path,\n",
|
||||||
|
" enable_eos_bos=C.enable_eos_bos_chars,\n",
|
||||||
|
")\n",
|
||||||
|
"loader = DataLoader(\n",
|
||||||
|
" dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
|
||||||
|
")\n"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
"source": [
|
||||||
"### Generate model outputs "
|
"### Generate model outputs "
|
||||||
]
|
],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import pickle\n",
|
"import pickle\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -206,42 +212,42 @@
|
||||||
"\n",
|
"\n",
|
||||||
" print(np.mean(losses))\n",
|
" print(np.mean(losses))\n",
|
||||||
" print(np.mean(postnet_losses))"
|
" print(np.mean(postnet_losses))"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# for pwgan\n",
|
"# for pwgan\n",
|
||||||
"with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
|
"with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
|
||||||
" for data in metadata:\n",
|
" for data in metadata:\n",
|
||||||
" f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
|
" f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
"source": [
|
||||||
"### Sanity Check"
|
"### Sanity Check"
|
||||||
]
|
],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"idx = 1\n",
|
"idx = 1\n",
|
||||||
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
|
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import soundfile as sf\n",
|
"import soundfile as sf\n",
|
||||||
"wav, sr = sf.read(item_idx[idx])\n",
|
"wav, sr = sf.read(item_idx[idx])\n",
|
||||||
|
@ -249,46 +255,46 @@
|
||||||
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
|
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
|
||||||
"mel_truth = ap.melspectrogram(wav)\n",
|
"mel_truth = ap.melspectrogram(wav)\n",
|
||||||
"print(mel_truth.shape)"
|
"print(mel_truth.shape)"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# plot posnet output\n",
|
"# plot posnet output\n",
|
||||||
"print(mel_postnet[:mel_lengths[idx], :].shape)\n",
|
"print(mel_postnet[:mel_lengths[idx], :].shape)\n",
|
||||||
"plot_spectrogram(mel_postnet, ap)"
|
"plot_spectrogram(mel_postnet, ap)"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# plot decoder output\n",
|
"# plot decoder output\n",
|
||||||
"print(mel_decoder.shape)\n",
|
"print(mel_decoder.shape)\n",
|
||||||
"plot_spectrogram(mel_decoder, ap)"
|
"plot_spectrogram(mel_decoder, ap)"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# plot GT specgrogram\n",
|
"# plot GT specgrogram\n",
|
||||||
"print(mel_truth.shape)\n",
|
"print(mel_truth.shape)\n",
|
||||||
"plot_spectrogram(mel_truth.T, ap)"
|
"plot_spectrogram(mel_truth.T, ap)"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# postnet, decoder diff\n",
|
"# postnet, decoder diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -297,13 +303,13 @@
|
||||||
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# PLOT GT SPECTROGRAM diff\n",
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -312,13 +318,13 @@
|
||||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# PLOT GT SPECTROGRAM diff\n",
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -328,21 +334,22 @@
|
||||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"source": [],
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"metadata": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"name": "python3",
|
||||||
"language": "python",
|
"display_name": "Python 3.9.7 64-bit ('base': conda)"
|
||||||
"name": "python3"
|
|
||||||
},
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
|
@ -354,7 +361,10 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.5"
|
"version": "3.9.7"
|
||||||
|
},
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs, init_training
|
||||||
|
from TTS.tts.configs import SpeedySpeechConfig
|
||||||
|
from TTS.utils.manage import ModelManager
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# init configs
|
||||||
|
dataset_config = BaseDatasetConfig(
|
||||||
|
name="ljspeech",
|
||||||
|
meta_file_train="metadata.csv",
|
||||||
|
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
|
||||||
|
path=os.path.join(output_path, "../LJSpeech-1.1/"),
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=60.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = SpeedySpeechConfig(
|
||||||
|
run_name="speedy_speech_ljspeech",
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=50,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=False,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
max_seq_len=500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# compute alignments
|
||||||
|
if not config.model_args.use_aligner:
|
||||||
|
manager = ModelManager()
|
||||||
|
model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
|
||||||
|
# TODO: make compute_attention python callable
|
||||||
|
os.system(
|
||||||
|
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
||||||
|
)
|
||||||
|
|
||||||
|
# train the model
|
||||||
|
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
||||||
|
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
|
||||||
|
trainer.fit()
|
4
setup.py
4
setup.py
|
@ -54,8 +54,8 @@ with open("README.md", "r", encoding="utf-8") as readme_file:
|
||||||
|
|
||||||
exts = [
|
exts = [
|
||||||
Extension(
|
Extension(
|
||||||
name="TTS.tts.layers.glow_tts.monotonic_align.core",
|
name="TTS.tts.utils.monotonic_align.core",
|
||||||
sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"],
|
sources=["TTS/tts/utils/monotonic_align/core.pyx"],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
setup(
|
setup(
|
||||||
|
|
|
@ -7,8 +7,8 @@ from TTS.utils.generic_utils import get_cuda
|
||||||
def get_device_id():
|
def get_device_id():
|
||||||
use_cuda, _ = get_cuda()
|
use_cuda, _ = get_cuda()
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "":
|
if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] != "":
|
||||||
GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]
|
GPU_ID = os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0]
|
||||||
else:
|
else:
|
||||||
GPU_ID = "0"
|
GPU_ID = "0"
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -68,15 +68,15 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
for i, data in enumerate(dataloader):
|
for i, data in enumerate(dataloader):
|
||||||
if i == self.max_loader_iter:
|
if i == self.max_loader_iter:
|
||||||
break
|
break
|
||||||
text_input = data['text']
|
text_input = data["text"]
|
||||||
text_lengths = data['text_lengths']
|
text_lengths = data["text_lengths"]
|
||||||
speaker_name = data['speaker_names']
|
speaker_name = data["speaker_names"]
|
||||||
linear_input = data['linear']
|
linear_input = data["linear"]
|
||||||
mel_input = data['mel']
|
mel_input = data["mel"]
|
||||||
mel_lengths = data['mel_lengths']
|
mel_lengths = data["mel_lengths"]
|
||||||
stop_target = data['stop_targets']
|
stop_target = data["stop_targets"]
|
||||||
item_idx = data['item_idxs']
|
item_idx = data["item_idxs"]
|
||||||
wavs = data['waveform']
|
wavs = data["waveform"]
|
||||||
|
|
||||||
neg_values = text_input[text_input < 0]
|
neg_values = text_input[text_input < 0]
|
||||||
check_count = len(neg_values)
|
check_count = len(neg_values)
|
||||||
|
@ -113,14 +113,14 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
for i, data in enumerate(dataloader):
|
for i, data in enumerate(dataloader):
|
||||||
if i == self.max_loader_iter:
|
if i == self.max_loader_iter:
|
||||||
break
|
break
|
||||||
text_input = data['text']
|
text_input = data["text"]
|
||||||
text_lengths = data['text_lengths']
|
text_lengths = data["text_lengths"]
|
||||||
speaker_name = data['speaker_names']
|
speaker_name = data["speaker_names"]
|
||||||
linear_input = data['linear']
|
linear_input = data["linear"]
|
||||||
mel_input = data['mel']
|
mel_input = data["mel"]
|
||||||
mel_lengths = data['mel_lengths']
|
mel_lengths = data["mel_lengths"]
|
||||||
stop_target = data['stop_targets']
|
stop_target = data["stop_targets"]
|
||||||
item_idx = data['item_idxs']
|
item_idx = data["item_idxs"]
|
||||||
|
|
||||||
avg_length = mel_lengths.numpy().mean()
|
avg_length = mel_lengths.numpy().mean()
|
||||||
assert avg_length >= last_length
|
assert avg_length >= last_length
|
||||||
|
@ -139,14 +139,14 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
for i, data in enumerate(dataloader):
|
for i, data in enumerate(dataloader):
|
||||||
if i == self.max_loader_iter:
|
if i == self.max_loader_iter:
|
||||||
break
|
break
|
||||||
text_input = data['text']
|
text_input = data["text"]
|
||||||
text_lengths = data['text_lengths']
|
text_lengths = data["text_lengths"]
|
||||||
speaker_name = data['speaker_names']
|
speaker_name = data["speaker_names"]
|
||||||
linear_input = data['linear']
|
linear_input = data["linear"]
|
||||||
mel_input = data['mel']
|
mel_input = data["mel"]
|
||||||
mel_lengths = data['mel_lengths']
|
mel_lengths = data["mel_lengths"]
|
||||||
stop_target = data['stop_targets']
|
stop_target = data["stop_targets"]
|
||||||
item_idx = data['item_idxs']
|
item_idx = data["item_idxs"]
|
||||||
|
|
||||||
# check mel_spec consistency
|
# check mel_spec consistency
|
||||||
wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
|
wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
|
||||||
|
@ -188,14 +188,14 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
for i, data in enumerate(dataloader):
|
for i, data in enumerate(dataloader):
|
||||||
if i == self.max_loader_iter:
|
if i == self.max_loader_iter:
|
||||||
break
|
break
|
||||||
text_input = data['text']
|
text_input = data["text"]
|
||||||
text_lengths = data['text_lengths']
|
text_lengths = data["text_lengths"]
|
||||||
speaker_name = data['speaker_names']
|
speaker_name = data["speaker_names"]
|
||||||
linear_input = data['linear']
|
linear_input = data["linear"]
|
||||||
mel_input = data['mel']
|
mel_input = data["mel"]
|
||||||
mel_lengths = data['mel_lengths']
|
mel_lengths = data["mel_lengths"]
|
||||||
stop_target = data['stop_targets']
|
stop_target = data["stop_targets"]
|
||||||
item_idx = data['item_idxs']
|
item_idx = data["item_idxs"]
|
||||||
|
|
||||||
if mel_lengths[0] > mel_lengths[1]:
|
if mel_lengths[0] > mel_lengths[1]:
|
||||||
idx = 0
|
idx = 0
|
||||||
|
|
|
@ -11,11 +11,10 @@ def test_synthesize():
|
||||||
# single speaker model
|
# single speaker model
|
||||||
run_cli(f'tts --text "This is an example." --out_path "{output_path}"')
|
run_cli(f'tts --text "This is an example." --out_path "{output_path}"')
|
||||||
run_cli(
|
run_cli(
|
||||||
"tts --model_name tts_models/en/ljspeech/speedy-speech-wn "
|
"tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"'
|
||||||
f'--text "This is an example." --out_path "{output_path}"'
|
|
||||||
)
|
)
|
||||||
run_cli(
|
run_cli(
|
||||||
"tts --model_name tts_models/en/ljspeech/speedy-speech-wn "
|
"tts --model_name tts_models/en/ljspeech/glow-tts "
|
||||||
"--vocoder_name vocoder_models/en/ljspeech/multiband-melgan "
|
"--vocoder_name vocoder_models/en/ljspeech/multiband-melgan "
|
||||||
f'--text "This is an example." --out_path "{output_path}"'
|
f'--text "This is an example." --out_path "{output_path}"'
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
import unittest
|
|
||||||
|
|
||||||
import torch as T
|
|
||||||
|
|
||||||
from TTS.tts.models.fast_pitch import FastPitch, FastPitchArgs, average_pitch
|
|
||||||
# pylint: disable=unused-variable
|
|
||||||
|
|
||||||
|
|
||||||
class AveragePitchTests(unittest.TestCase):
|
|
||||||
def test_in_out(self): # pylint: disable=no-self-use
|
|
||||||
pitch = T.rand(1, 1, 128)
|
|
||||||
|
|
||||||
durations = T.randint(1, 5, (1, 21))
|
|
||||||
coeff = 128.0 / durations.sum()
|
|
||||||
durations = T.round(durations * coeff)
|
|
||||||
diff = 128.0 - durations.sum()
|
|
||||||
durations[0, -1] += diff
|
|
||||||
durations = durations.long()
|
|
||||||
|
|
||||||
pitch_avg = average_pitch(pitch, durations)
|
|
||||||
|
|
||||||
index = 0
|
|
||||||
for idx, dur in enumerate(durations[0]):
|
|
||||||
assert abs(pitch_avg[0, 0, idx] - pitch[0, 0, index : index + dur.item()].mean()) < 1e-5
|
|
||||||
index += dur
|
|
||||||
|
|
||||||
|
|
||||||
def expand_encoder_outputs_test():
|
|
||||||
model = FastPitch(FastPitchArgs(num_chars=10))
|
|
||||||
|
|
||||||
inputs = T.rand(2, 5, 57)
|
|
||||||
durations = T.randint(1, 4, (2, 57))
|
|
||||||
|
|
||||||
x_mask = T.ones(2, 1, 57)
|
|
||||||
y_mask = T.ones(2, 1, durations.sum(1).max())
|
|
||||||
|
|
||||||
expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask)
|
|
||||||
|
|
||||||
for b in range(durations.shape[0]):
|
|
||||||
index = 0
|
|
||||||
for idx, dur in enumerate(durations[b]):
|
|
||||||
diff = (
|
|
||||||
expanded[b, :, index : index + dur.item()]
|
|
||||||
- inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape)
|
|
||||||
).sum()
|
|
||||||
assert abs(diff) < 1e-6, diff
|
|
||||||
index += dur
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from tests import get_device_id, get_tests_output_path, run_cli
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.tts.configs import FastPitchConfig
|
||||||
|
|
||||||
|
config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json")
|
||||||
|
output_path = os.path.join(get_tests_output_path(), "train_outputs")
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=60.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = FastPitchConfig(
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=8,
|
||||||
|
eval_batch_size=8,
|
||||||
|
num_loader_workers=0,
|
||||||
|
num_eval_loader_workers=0,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
|
||||||
|
f0_cache_path="tests/data/ljspeech/f0_cache/",
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1,
|
||||||
|
print_step=1,
|
||||||
|
print_eval=True,
|
||||||
|
test_sentences=[
|
||||||
|
"Be a voice, not an echo.",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
config.audio.do_trim_silence = True
|
||||||
|
config.audio.trim_db = 60
|
||||||
|
config.save_json(config_path)
|
||||||
|
|
||||||
|
# train the model for one epoch
|
||||||
|
command_train = (
|
||||||
|
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
|
||||||
|
f"--coqpit.output_path {output_path} "
|
||||||
|
"--coqpit.datasets.0.name ljspeech "
|
||||||
|
"--coqpit.datasets.0.meta_file_train metadata.csv "
|
||||||
|
"--coqpit.datasets.0.meta_file_val metadata.csv "
|
||||||
|
"--coqpit.datasets.0.path tests/data/ljspeech "
|
||||||
|
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
|
||||||
|
"--coqpit.test_delay_epochs 0"
|
||||||
|
)
|
||||||
|
run_cli(command_train)
|
||||||
|
|
||||||
|
# Find latest folder
|
||||||
|
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
|
||||||
|
|
||||||
|
# restore the model and continue training for one more epoch
|
||||||
|
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
|
||||||
|
run_cli(command_train)
|
||||||
|
shutil.rmtree(continue_path)
|
|
@ -2,7 +2,7 @@ import torch
|
||||||
|
|
||||||
from TTS.tts.layers.feed_forward.decoder import Decoder
|
from TTS.tts.layers.feed_forward.decoder import Decoder
|
||||||
from TTS.tts.layers.feed_forward.encoder import Encoder
|
from TTS.tts.layers.feed_forward.encoder import Encoder
|
||||||
from TTS.tts.utils.data import sequence_mask
|
from TTS.tts.utils.helpers import sequence_mask
|
||||||
|
|
||||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,147 @@
|
||||||
|
import torch as T
|
||||||
|
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS, ForwardTTSArgs
|
||||||
|
from TTS.tts.utils.helpers import sequence_mask
|
||||||
|
|
||||||
|
# pylint: disable=unused-variable
|
||||||
|
|
||||||
|
|
||||||
|
def expand_encoder_outputs_test():
|
||||||
|
model = ForwardTTS(ForwardTTSArgs(num_chars=10))
|
||||||
|
|
||||||
|
inputs = T.rand(2, 5, 57)
|
||||||
|
durations = T.randint(1, 4, (2, 57))
|
||||||
|
|
||||||
|
x_mask = T.ones(2, 1, 57)
|
||||||
|
y_mask = T.ones(2, 1, durations.sum(1).max())
|
||||||
|
|
||||||
|
expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask)
|
||||||
|
|
||||||
|
for b in range(durations.shape[0]):
|
||||||
|
index = 0
|
||||||
|
for idx, dur in enumerate(durations[b]):
|
||||||
|
diff = (
|
||||||
|
expanded[b, :, index : index + dur.item()]
|
||||||
|
- inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape)
|
||||||
|
).sum()
|
||||||
|
assert abs(diff) < 1e-6, diff
|
||||||
|
index += dur
|
||||||
|
|
||||||
|
|
||||||
|
def model_input_output_test():
|
||||||
|
"""Assert the output shapes of the model in different modes"""
|
||||||
|
|
||||||
|
# VANILLA MODEL
|
||||||
|
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=False))
|
||||||
|
|
||||||
|
x = T.randint(0, 10, (2, 21))
|
||||||
|
x_lengths = T.randint(10, 22, (2,))
|
||||||
|
x_lengths[-1] = 21
|
||||||
|
x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
|
||||||
|
durations = T.randint(1, 4, (2, 21))
|
||||||
|
durations = durations * x_mask.squeeze(1)
|
||||||
|
y_lengths = durations.sum(1)
|
||||||
|
y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
|
||||||
|
|
||||||
|
outputs = model.forward(x, x_lengths, y_lengths, dr=durations)
|
||||||
|
|
||||||
|
assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
|
||||||
|
assert outputs["durations_log"].shape == (2, 21)
|
||||||
|
assert outputs["durations"].shape == (2, 21)
|
||||||
|
assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert (outputs["x_mask"] - x_mask).sum() == 0.0
|
||||||
|
assert (outputs["y_mask"] - y_mask).sum() == 0.0
|
||||||
|
|
||||||
|
assert outputs["alignment_soft"] is None
|
||||||
|
assert outputs["alignment_mas"] is None
|
||||||
|
assert outputs["alignment_logprob"] is None
|
||||||
|
assert outputs["o_alignment_dur"] is None
|
||||||
|
assert outputs["pitch_avg"] is None
|
||||||
|
assert outputs["pitch_avg_gt"] is None
|
||||||
|
|
||||||
|
# USE PITCH
|
||||||
|
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=False))
|
||||||
|
|
||||||
|
x = T.randint(0, 10, (2, 21))
|
||||||
|
x_lengths = T.randint(10, 22, (2,))
|
||||||
|
x_lengths[-1] = 21
|
||||||
|
x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
|
||||||
|
durations = T.randint(1, 4, (2, 21))
|
||||||
|
durations = durations * x_mask.squeeze(1)
|
||||||
|
y_lengths = durations.sum(1)
|
||||||
|
y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
|
||||||
|
pitch = T.rand(2, 1, y_lengths.max())
|
||||||
|
|
||||||
|
outputs = model.forward(x, x_lengths, y_lengths, dr=durations, pitch=pitch)
|
||||||
|
|
||||||
|
assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
|
||||||
|
assert outputs["durations_log"].shape == (2, 21)
|
||||||
|
assert outputs["durations"].shape == (2, 21)
|
||||||
|
assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert (outputs["x_mask"] - x_mask).sum() == 0.0
|
||||||
|
assert (outputs["y_mask"] - y_mask).sum() == 0.0
|
||||||
|
assert outputs["pitch_avg"].shape == (2, 1, 21)
|
||||||
|
assert outputs["pitch_avg_gt"].shape == (2, 1, 21)
|
||||||
|
|
||||||
|
assert outputs["alignment_soft"] is None
|
||||||
|
assert outputs["alignment_mas"] is None
|
||||||
|
assert outputs["alignment_logprob"] is None
|
||||||
|
assert outputs["o_alignment_dur"] is None
|
||||||
|
|
||||||
|
# USE ALIGNER NETWORK
|
||||||
|
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=False, use_aligner=True))
|
||||||
|
|
||||||
|
x = T.randint(0, 10, (2, 21))
|
||||||
|
x_lengths = T.randint(10, 22, (2,))
|
||||||
|
x_lengths[-1] = 21
|
||||||
|
x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
|
||||||
|
durations = T.randint(1, 4, (2, 21))
|
||||||
|
durations = durations * x_mask.squeeze(1)
|
||||||
|
y_lengths = durations.sum(1)
|
||||||
|
y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
|
||||||
|
y = T.rand(2, y_lengths.max(), 80)
|
||||||
|
|
||||||
|
outputs = model.forward(x, x_lengths, y_lengths, dr=durations, y=y)
|
||||||
|
|
||||||
|
assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
|
||||||
|
assert outputs["durations_log"].shape == (2, 21)
|
||||||
|
assert outputs["durations"].shape == (2, 21)
|
||||||
|
assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert (outputs["x_mask"] - x_mask).sum() == 0.0
|
||||||
|
assert (outputs["y_mask"] - y_mask).sum() == 0.0
|
||||||
|
assert outputs["alignment_soft"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["alignment_mas"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["o_alignment_dur"].shape == (2, 21)
|
||||||
|
|
||||||
|
assert outputs["pitch_avg"] is None
|
||||||
|
assert outputs["pitch_avg_gt"] is None
|
||||||
|
|
||||||
|
# USE ALIGNER NETWORK AND PITCH
|
||||||
|
model = ForwardTTS(ForwardTTSArgs(num_chars=10, use_pitch=True, use_aligner=True))
|
||||||
|
|
||||||
|
x = T.randint(0, 10, (2, 21))
|
||||||
|
x_lengths = T.randint(10, 22, (2,))
|
||||||
|
x_lengths[-1] = 21
|
||||||
|
x_mask = sequence_mask(x_lengths).unsqueeze(1).long()
|
||||||
|
durations = T.randint(1, 4, (2, 21))
|
||||||
|
durations = durations * x_mask.squeeze(1)
|
||||||
|
y_lengths = durations.sum(1)
|
||||||
|
y_mask = sequence_mask(y_lengths).unsqueeze(1).long()
|
||||||
|
y = T.rand(2, y_lengths.max(), 80)
|
||||||
|
pitch = T.rand(2, 1, y_lengths.max())
|
||||||
|
|
||||||
|
outputs = model.forward(x, x_lengths, y_lengths, dr=durations, pitch=pitch, y=y)
|
||||||
|
|
||||||
|
assert outputs["model_outputs"].shape == (2, durations.sum(1).max(), 80)
|
||||||
|
assert outputs["durations_log"].shape == (2, 21)
|
||||||
|
assert outputs["durations"].shape == (2, 21)
|
||||||
|
assert outputs["alignments"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert (outputs["x_mask"] - x_mask).sum() == 0.0
|
||||||
|
assert (outputs["y_mask"] - y_mask).sum() == 0.0
|
||||||
|
assert outputs["alignment_soft"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["alignment_mas"].shape == (2, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["alignment_logprob"].shape == (2, 1, durations.sum(1).max(), 21)
|
||||||
|
assert outputs["o_alignment_dur"].shape == (2, 21)
|
||||||
|
assert outputs["pitch_avg"].shape == (2, 1, 21)
|
||||||
|
assert outputs["pitch_avg_gt"].shape == (2, 1, 21)
|
|
@ -0,0 +1,60 @@
|
||||||
|
import torch as T
|
||||||
|
|
||||||
|
from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask
|
||||||
|
|
||||||
|
|
||||||
|
def average_over_durations_test(): # pylint: disable=no-self-use
|
||||||
|
pitch = T.rand(1, 1, 128)
|
||||||
|
|
||||||
|
durations = T.randint(1, 5, (1, 21))
|
||||||
|
coeff = 128.0 / durations.sum()
|
||||||
|
durations = T.floor(durations * coeff)
|
||||||
|
diff = 128.0 - durations.sum()
|
||||||
|
durations[0, -1] += diff
|
||||||
|
durations = durations.long()
|
||||||
|
|
||||||
|
pitch_avg = average_over_durations(pitch, durations)
|
||||||
|
|
||||||
|
index = 0
|
||||||
|
for idx, dur in enumerate(durations[0]):
|
||||||
|
assert abs(pitch_avg[0, 0, idx] - pitch[0, 0, index : index + dur.item()].mean()) < 1e-5
|
||||||
|
index += dur
|
||||||
|
|
||||||
|
|
||||||
|
def seqeunce_mask_test():
|
||||||
|
lengths = T.randint(10, 15, (8,))
|
||||||
|
mask = sequence_mask(lengths)
|
||||||
|
for i in range(8):
|
||||||
|
l = lengths[i].item()
|
||||||
|
assert mask[i, :l].sum() == l
|
||||||
|
assert mask[i, l:].sum() == 0
|
||||||
|
|
||||||
|
|
||||||
|
def segment_test():
|
||||||
|
x = T.range(0, 11)
|
||||||
|
x = x.repeat(8, 1).unsqueeze(1)
|
||||||
|
segment_ids = T.randint(0, 7, (8,))
|
||||||
|
|
||||||
|
segments = segment(x, segment_ids, segment_size=4)
|
||||||
|
for idx, start_indx in enumerate(segment_ids):
|
||||||
|
assert x[idx, :, start_indx : start_indx + 4].sum() == segments[idx, :, :].sum()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_path_test():
|
||||||
|
durations = T.randint(1, 4, (10, 21))
|
||||||
|
x_length = T.randint(18, 22, (10,))
|
||||||
|
x_mask = sequence_mask(x_length).unsqueeze(1).long()
|
||||||
|
durations = durations * x_mask.squeeze(1)
|
||||||
|
y_length = durations.sum(1)
|
||||||
|
y_mask = sequence_mask(y_length).unsqueeze(1).long()
|
||||||
|
attn_mask = (T.unsqueeze(x_mask, -1) * T.unsqueeze(y_mask, 2)).squeeze(1).long()
|
||||||
|
print(attn_mask.shape)
|
||||||
|
path = generate_path(durations, attn_mask)
|
||||||
|
assert path.shape == (10, 21, durations.sum(1).max().item())
|
||||||
|
for b in range(durations.shape[0]):
|
||||||
|
current_idx = 0
|
||||||
|
for t in range(durations.shape[1]):
|
||||||
|
assert all(path[b, t, current_idx : current_idx + durations[b, t].item()] == 1.0)
|
||||||
|
assert all(path[b, t, :current_idx] == 0.0)
|
||||||
|
assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0)
|
||||||
|
current_idx += durations[b, t].item()
|
|
@ -1,96 +0,0 @@
import torch

from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
from TTS.tts.utils.data import sequence_mask

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def test_duration_predictor():
    input_dummy = torch.rand(8, 128, 27).to(device)
    input_lengths = torch.randint(20, 27, (8,)).long().to(device)
    input_lengths[-1] = 27

    x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)

    layer = DurationPredictor(hidden_channels=128).to(device)

    output = layer(input_dummy, x_mask)
    assert list(output.shape) == [8, 1, 27]


def test_speedy_speech():
    num_chars = 7
    B = 8
    T_en = 37
    T_de = 74

    x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
    x_lengths = torch.randint(31, T_en, (B,)).long().to(device)
    x_lengths[-1] = T_en

    # set durations; scale them so the max total duration equals T_de
    durations = torch.randint(1, 4, (B, T_en))
    durations = durations * (T_de / durations.sum(1)).unsqueeze(1)
    durations = durations.to(torch.long).to(device)
    max_dur = durations.sum(1).max()
    durations[:, 0] += T_de - max_dur if T_de > max_dur else 0

    y_lengths = durations.sum(1)

    config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
    model = SpeedySpeech(config)
    if use_cuda:
        model.cuda()

    # forward pass
    outputs = model(x_dummy, x_lengths, y_lengths, durations)
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]

    # with speaker embedding
    config = SpeedySpeechConfig(
        model_args=SpeedySpeechArgs(
            num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
        )
    )
    model = SpeedySpeech(config).to(device)
    outputs = model.forward(
        x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
    )
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]

    # with external speaker embedding (d-vectors)
    config = SpeedySpeechConfig(
        model_args=SpeedySpeechArgs(
            num_chars=num_chars,
            out_channels=80,
            hidden_channels=128,
            num_speakers=10,
            use_d_vector=True,
            d_vector_dim=256,
        )
    )
    model = SpeedySpeech(config).to(device)
    outputs = model.forward(
        x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}
    )
    o_de = outputs["model_outputs"]
    attn = outputs["alignments"]
    o_dr = outputs["durations_log"]

    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
    assert list(attn.shape) == [B, T_de, T_en]
    assert list(o_dr.shape) == [B, T_en]
@ -4,14 +4,12 @@ import shutil
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.models.speedy_speech import SpeedySpeechArgs


config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")


config = SpeedySpeechConfig(
    model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
@ -38,6 +38,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_train_step(self):
        """test forward pass"""
        (
@ -70,6 +71,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        # inference pass
        output = model(chars_seq, training=False)

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_forward_attention(
        self,
    ):
@ -103,6 +105,7 @@ class TacotronTFTrainTest(unittest.TestCase):
        # inference pass
        output = model(chars_seq, training=False)

    @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.")
    def test_tflite_conversion(
        self,
    ):  # pylint:disable=no-self-use
@ -4,7 +4,7 @@ import torch as T
from TTS.tts.layers.losses import L1LossMasked, SSIMLoss
from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.helpers import sequence_mask

# pylint: disable=unused-variable
@ -1,9 +1,15 @@
import unittest

import numpy as np
import tensorflow as tf
import torch

from TTS.vocoder.tf.models.melgan_generator import MelganGenerator

use_cuda = torch.cuda.is_available()


@unittest.skipIf(use_cuda, " [!] Skip Test: Loosy TF support.")
def test_melgan_generator():
    hop_length = 256
    model = MelganGenerator()
@ -1,7 +1,9 @@
import os
import unittest

import soundfile as sf
import tensorflow as tf
import torch
from librosa.core import load

from tests import get_tests_input_path, get_tests_output_path, get_tests_path
@ -9,8 +11,10 @@ from TTS.vocoder.tf.layers.pqmf import PQMF
TESTS_PATH = get_tests_path()
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
use_cuda = torch.cuda.is_available()


@unittest.skipIf(use_cuda, " [!] Skip Test: Loosy TF support.")
def test_pqmf():
    w, sr = load(WAV_FILE)