Merge pull request #887 from coqui-ai/vctk_recipes

VCTK Recipes
This commit is contained in:
Eren Gölge 2021-10-21 19:54:44 +02:00 committed by GitHub
commit 1e9a97560b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
86 changed files with 1798 additions and 2165 deletions

View File

@ -1,4 +1,4 @@
name: CI name: aux-tests
on: on:
push: push:
@ -45,8 +45,5 @@ jobs:
run: | run: |
python3 -m pip install .[all] python3 -m pip install .[all]
python3 setup.py egg_info python3 setup.py egg_info
- name: Lint check
run: |
make lint
- name: Unit tests - name: Unit tests
run: make test run: make test_aux

50
.github/workflows/style_check.yml vendored Normal file
View File

@ -0,0 +1,50 @@
# Runs the project's lint target on pushes to `main` and on pull requests.
name: style-check
on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened]
jobs:
  # Gate job: skipped when the head commit message contains "[ci skip]".
  check_skip:
    runs-on: ubuntu-latest
    if: "! contains(github.event.head_commit.message, '[ci skip]')"
    steps:
      # The commit message is attacker-controllable text. Hand it to the shell
      # through an environment variable instead of expanding it inside the
      # script, so it can never be interpreted as shell code.
      - env:
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: echo "$COMMIT_MESSAGE"
  test:
    # Depend on the gate so that a skipped `check_skip` also skips this job;
    # without `needs` the "[ci skip]" marker has no effect.
    needs: check_skip
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Quoted so versions are never parsed as floats (e.g. 3.10 -> 3.1).
        python-version: ["3.9"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - name: check OS
        run: cat /etc/os-release
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y git make
          sudo apt install -y python3-wheel gcc
          make system-deps
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
          python3 setup.py egg_info
      - name: Lint check
        run: |
          make lint

49
.github/workflows/tts_tests.yml vendored Normal file
View File

@ -0,0 +1,49 @@
# Runs the TTS unit tests (`make test_tts`) on pushes to `main` and on pull
# requests, across the supported Python versions.
name: tts-tests
on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened]
jobs:
  # Gate job: skipped when the head commit message contains "[ci skip]".
  check_skip:
    runs-on: ubuntu-latest
    if: "! contains(github.event.head_commit.message, '[ci skip]')"
    steps:
      # The commit message is attacker-controllable text. Hand it to the shell
      # through an environment variable instead of expanding it inside the
      # script, so it can never be interpreted as shell code.
      - env:
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: echo "$COMMIT_MESSAGE"
  test:
    # Depend on the gate so that a skipped `check_skip` also skips this job;
    # without `needs` the "[ci skip]" marker has no effect.
    needs: check_skip
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Quoted so versions are never parsed as floats (e.g. 3.10 -> 3.1).
        python-version: ["3.6", "3.7", "3.8", "3.9"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - name: check OS
        run: cat /etc/os-release
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y git make
          sudo apt install -y python3-wheel gcc
          make system-deps
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
          python3 setup.py egg_info
      - name: Unit tests
        run: make test_tts

49
.github/workflows/vocoder_tests.yml vendored Normal file
View File

@ -0,0 +1,49 @@
# Runs the vocoder unit tests (`make test_vocoder`) on pushes to `main` and on
# pull requests, across the supported Python versions.
name: vocoder-tests
on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened]
jobs:
  # Gate job: skipped when the head commit message contains "[ci skip]".
  check_skip:
    runs-on: ubuntu-latest
    if: "! contains(github.event.head_commit.message, '[ci skip]')"
    steps:
      # The commit message is attacker-controllable text. Hand it to the shell
      # through an environment variable instead of expanding it inside the
      # script, so it can never be interpreted as shell code.
      - env:
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: echo "$COMMIT_MESSAGE"
  test:
    # Depend on the gate so that a skipped `check_skip` also skips this job;
    # without `needs` the "[ci skip]" marker has no effect.
    needs: check_skip
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Quoted so versions are never parsed as floats (e.g. 3.10 -> 3.1).
        python-version: ["3.6", "3.7", "3.8", "3.9"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - name: check OS
        run: cat /etc/os-release
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y git make
          sudo apt install -y python3-wheel gcc
          make system-deps
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
          python3 setup.py egg_info
      - name: Unit tests
        run: make test_vocoder

12
.gitignore vendored
View File

@ -124,6 +124,15 @@ version.py
# jupyter dummy files # jupyter dummy files
core core
# ignore local datasets
recipes/WIP/*
recipes/ljspeech/LJSpeech-1.1/*
recipes/vctk/VCTK/*
VCTK-Corpus-removed-silence/*
# ignore training logs
trainer_*_log.txt
# files used internally fro dev, test etc. # files used internally fro dev, test etc.
tests/outputs/* tests/outputs/*
tests/train_outputs/* tests/train_outputs/*
@ -134,9 +143,6 @@ notebooks/data/*
TTS/tts/layers/glow_tts/monotonic_align/core.c TTS/tts/layers/glow_tts/monotonic_align/core.c
.vscode-upload.json .vscode-upload.json
temp_build/* temp_build/*
recipes/WIP/*
recipes/ljspeech/LJSpeech-1.1/*
recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/*
events.out* events.out*
old_configs/* old_configs/*
model_importers/* model_importers/*

View File

@ -12,6 +12,15 @@ test_all: ## run tests and don't stop on an error.
test: ## run tests. test: ## run tests.
nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
test_vocoder: ## run vocoder tests.
nosetests tests.vocoder_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.vocoder_tests --nologcapture --with-id
test_tts: ## run tts tests.
nosetests tests.tts_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.tts_tests --nologcapture --with-id
test_aux: ## run aux tests.
nosetests tests.aux_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.aux_tests --nologcapture --with-id
./run_bash_tests.sh ./run_bash_tests.sh
test_failed: ## only run tests failed the last time. test_failed: ## only run tests failed the last time.

View File

@ -4,6 +4,7 @@ from TTS.config import load_config, register_config
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models import setup_model from TTS.tts.models import setup_model
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
@ -43,8 +44,16 @@ def main():
# setup audio processor # setup audio processor
ap = AudioProcessor(**config.audio) ap = AudioProcessor(**config.audio)
# init speaker manager
if config.use_speaker_embedding:
speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
elif config.use_d_vector_file:
speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
else:
speaker_manager = None
# init the model from config # init the model from config
model = setup_model(config) model = setup_model(config, speaker_manager)
# init the trainer and 🚀 # init the trainer and 🚀
trainer = Trainer( trainer = Trainer(

View File

@ -36,10 +36,11 @@ def register_config(model_name: str) -> Coqpit:
Coqpit: config class. Coqpit: config class.
""" """
config_class = None config_class = None
config_name = model_name + "_config"
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"]
for path in paths: for path in paths:
try: try:
config_class = find_module(path, model_name + "_config") config_class = find_module(path, config_name)
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
if config_class is None: if config_class is None:

View File

@ -182,14 +182,22 @@ class Trainer:
- TPU training - TPU training
- NOTE: Consider moving `training_assets` to the model implementation. - NOTE: Consider moving `training_assets` to the model implementation.
""" """
if parse_command_line_args: if parse_command_line_args:
# parse command-line arguments for TrainingArgs() # parse command-line arguments for TrainerArgs()
args, coqpit_overrides = self.parse_argv(args) args, coqpit_overrides = self.parse_argv(args)
# get ready for training and parse command-line arguments for the model config # get ready for training and parse command-line arguments for the model config
config = self.init_training(args, coqpit_overrides, config) config = self.init_training(args, coqpit_overrides, config)
# define the experiment path and create the folder # set the output path
if args.continue_path:
# use the same path as the continuing run
output_path = args.continue_path
else:
# override the output path if it is provided
output_path = config.output_path if output_path is None else output_path
# create a new output folder name
output_path = get_experiment_folder_path(config.output_path, config.run_name) output_path = get_experiment_folder_path(config.output_path, config.run_name)
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
@ -252,11 +260,6 @@ class Trainer:
else: else:
self.run_get_model(self.config, get_model) self.run_get_model(self.config, get_model)
# TODO: out!
# init multispeaker settings of the model
if hasattr(self.model, "init_multispeaker"):
self.model.init_multispeaker(self.config, self.train_samples + self.eval_samples)
# setup criterion # setup criterion
self.criterion = self.get_criterion(self.model) self.criterion = self.get_criterion(self.model)
@ -359,7 +362,7 @@ class Trainer:
# override config values from command-line args # override config values from command-line args
# TODO: Maybe it is better to do it outside # TODO: Maybe it is better to do it outside
if len(coqpit_overrides) > 0: if len(coqpit_overrides) > 0:
config.parse_known_args(coqpit_overrides, relaxed_parser=True) config.parse_known_args(coqpit_overrides, arg_prefix="coqpit", relaxed_parser=True)
experiment_path = args.continue_path experiment_path = args.continue_path
# update the config.json fields and copy it to the output folder # update the config.json fields and copy it to the output folder
@ -615,10 +618,8 @@ class Trainer:
else: else:
grad_clip = 0.0 # meaning no gradient clipping grad_clip = 0.0 # meaning no gradient clipping
if grad_clip <= 0:
grad_norm = 0
# optimizer step # optimizer step
grad_norm = 0
update_lr_scheduler = True update_lr_scheduler = True
if self.use_amp_scaler: if self.use_amp_scaler:
if self.use_apex: if self.use_apex:
@ -633,13 +634,11 @@ class Trainer:
if grad_clip > 0: if grad_clip > 0:
scaler.unscale_(optimizer) scaler.unscale_(optimizer)
grad_norm = torch.nn.utils.clip_grad_norm_(self.master_params(optimizer), grad_clip) grad_norm = torch.nn.utils.clip_grad_norm_(self.master_params(optimizer), grad_clip)
# pytorch skips the step when the norm is 0. So ignore the norm value when it is NaN
if torch.isnan(grad_norm) or torch.isinf(grad_norm):
grad_norm = 0
scale_prev = scaler.get_scale() scale_prev = scaler.get_scale()
scaler.step(optimizer) scaler.step(optimizer)
scaler.update() scaler.update()
update_lr_scheduler = scale_prev <= scaler.get_scale() update_lr_scheduler = scale_prev <= scaler.get_scale()
loss_dict["amp_scaler"] = scaler.get_scale() # for logging
else: else:
# main model optimizer step # main model optimizer step
loss_dict["loss"].backward() loss_dict["loss"].backward()
@ -647,6 +646,10 @@ class Trainer:
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
optimizer.step() optimizer.step()
# pytorch skips the step when the norm is 0. So ignore the norm value when it is NaN
if isinstance(grad_norm, torch.Tensor) and (torch.isnan(grad_norm) or torch.isinf(grad_norm)):
grad_norm = 0
step_time = time.time() - step_start_time step_time = time.time() - step_start_time
# setup lr # setup lr
@ -1144,7 +1147,7 @@ class Trainer:
if isinstance(value, (int, float)): if isinstance(value, (int, float)):
loss_dict_detached[key] = value loss_dict_detached[key] = value
else: else:
loss_dict_detached[key] = value.detach() loss_dict_detached[key] = value.detach().item()
return loss_dict_detached return loss_dict_detached
def _pick_target_avg_loss(self, keep_avg_target: KeepAverage) -> Dict: def _pick_target_avg_loss(self, keep_avg_target: KeepAverage) -> Dict:

View File

@ -3,15 +3,15 @@ import os
from inspect import isclass from inspect import isclass
# import all files under configs/ # import all files under configs/
configs_dir = os.path.dirname(__file__) # configs_dir = os.path.dirname(__file__)
for file in os.listdir(configs_dir): # for file in os.listdir(configs_dir):
path = os.path.join(configs_dir, file) # path = os.path.join(configs_dir, file)
if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
config_name = file[: file.find(".py")] if file.endswith(".py") else file # config_name = file[: file.find(".py")] if file.endswith(".py") else file
module = importlib.import_module("TTS.tts.configs." + config_name) # module = importlib.import_module("TTS.tts.configs." + config_name)
for attribute_name in dir(module): # for attribute_name in dir(module):
attribute = getattr(module, attribute_name) # attribute = getattr(module, attribute_name)
if isclass(attribute): # if isclass(attribute):
# Add the class to this package's variables # # Add the class to this package's variables
globals()[attribute_name] = attribute # globals()[attribute_name] = attribute

View File

@ -11,7 +11,7 @@ class FastPitchConfig(BaseTTSConfig):
Example: Example:
>>> from TTS.tts.configs import FastPitchConfig >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
>>> config = FastPitchConfig() >>> config = FastPitchConfig()
Args: Args:
@ -30,6 +30,10 @@ class FastPitchConfig(BaseTTSConfig):
Activation Normalization that pre-computes normalization stats at the beginning and use the same values Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10. for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool): use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False. in the multi-speaker mode. Defaults to False.
@ -105,6 +109,8 @@ class FastPitchConfig(BaseTTSConfig):
model_args: ForwardTTSArgs = ForwardTTSArgs() model_args: ForwardTTSArgs = ForwardTTSArgs()
# multi-speaker settings # multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False use_speaker_embedding: bool = False
use_d_vector_file: bool = False use_d_vector_file: bool = False
d_vector_file: str = False d_vector_file: str = False
@ -149,3 +155,22 @@ class FastPitchConfig(BaseTTSConfig):
"Prior to November 22, 1963.", "Prior to November 22, 1963.",
] ]
) )
def __post_init__(self):
# Mirror the multi-speaker settings declared on this config onto `self.model_args`.
# Values are only written when set (positive count, truthy flag/path), so the
# corresponding `model_args` attributes are otherwise left untouched.
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
# `d_vector_dim` may be None; it is only forwarded when it is a positive number.
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file

View File

@ -30,6 +30,11 @@ class FastSpeechConfig(BaseTTSConfig):
Activation Normalization that pre-computes normalization stats at the beginning and use the same values Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10. for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool): use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False. in the multi-speaker mode. Defaults to False.
@ -105,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig):
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False) model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
# multi-speaker settings # multi-speaker settings
speakers_file: str = None
use_speaker_embedding: bool = False use_speaker_embedding: bool = False
use_d_vector_file: bool = False use_d_vector_file: bool = False
d_vector_file: str = False d_vector_file: str = False
@ -149,3 +155,22 @@ class FastSpeechConfig(BaseTTSConfig):
"Prior to November 22, 1963.", "Prior to November 22, 1963.",
] ]
) )
def __post_init__(self):
# Mirror the multi-speaker settings declared on this config onto `self.model_args`.
# Values are only written when set (positive count, truthy flag/path), so the
# corresponding `model_args` attributes are otherwise left untouched.
# NOTE(review): `num_speakers` is read below but is not among the multi-speaker
# fields visible for this class in this section — confirm it is declared on the
# class or one of its bases.
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
# `d_vector_dim` may be None; it is only forwarded when it is a positive number.
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file

View File

@ -218,7 +218,3 @@ class BaseTTSConfig(BaseTrainingConfig):
lr_scheduler_params: dict = field(default_factory=lambda: {}) lr_scheduler_params: dict = field(default_factory=lambda: {})
# testing # testing
test_sentences: List[str] = field(default_factory=lambda: []) test_sentences: List[str] = field(default_factory=lambda: [])
# multi-speaker
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_dim: int = 0

View File

@ -30,6 +30,10 @@ class SpeedySpeechConfig(BaseTTSConfig):
Activation Normalization that pre-computes normalization stats at the beginning and use the same values Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10. for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool): use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False. in the multi-speaker mode. Defaults to False.
@ -117,12 +121,13 @@ class SpeedySpeechConfig(BaseTTSConfig):
}, },
out_channels=80, out_channels=80,
hidden_channels=128, hidden_channels=128,
num_speakers=0,
positional_encoding=True, positional_encoding=True,
detach_duration_predictor=True, detach_duration_predictor=True,
) )
# multi-speaker settings # multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False use_speaker_embedding: bool = False
use_d_vector_file: bool = False use_d_vector_file: bool = False
d_vector_file: str = False d_vector_file: str = False
@ -166,3 +171,22 @@ class SpeedySpeechConfig(BaseTTSConfig):
"Prior to November 22, 1963.", "Prior to November 22, 1963.",
] ]
) )
def __post_init__(self):
# Mirror the multi-speaker settings declared on this config onto `self.model_args`.
# Values are only written when set (positive count, truthy flag/path), so the
# corresponding `model_args` attributes are otherwise left untouched.
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
# `d_vector_dim` may be None; it is only forwarded when it is a positive number.
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file

View File

@ -106,7 +106,7 @@ class TacotronConfig(BaseTTSConfig):
Weight decay coefficient. Defaults to `1e-6`. Weight decay coefficient. Defaults to `1e-6`.
grad_clip (float): grad_clip (float):
Gradient clipping threshold. Defaults to `5`. Gradient clipping threshold. Defaults to `5`.
seq_len_notm (bool): seq_len_norm (bool):
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
is divided by the sequence length. Defaults to False. is divided by the sequence length. Defaults to False.
loss_masking (bool): loss_masking (bool):

View File

@ -139,3 +139,36 @@ class VitsConfig(BaseTTSConfig):
"Prior to November 22, 1963.", "Prior to November 22, 1963.",
] ]
) )
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
use_speaker_embedding: bool = False
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = None
def __post_init__(self):
# Mirror the multi-speaker settings declared on this config onto `self.model_args`.
# Values are only written when set (positive count, truthy flag/path/size), so the
# corresponding `model_args` attributes are otherwise left untouched.
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# a zero/None `speaker_embedding_channels` is not forwarded
if self.speaker_embedding_channels:
self.model_args.speaker_embedding_channels = self.speaker_embedding_channels
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
# `d_vector_dim` defaults to None here; it is only forwarded when it is a positive number.
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file

View File

@ -330,7 +330,7 @@ class TTSDataset(Dataset):
if by_audio_len: if by_audio_len:
lengths = [] lengths = []
for item in self.items: for item in self.items:
lengths.append(os.path.getsize(item[1])) lengths.append(os.path.getsize(item[1]) / 16 * 8) # assuming 16bit audio
lengths = np.array(lengths) lengths = np.array(lengths)
else: else:
lengths = np.array([len(ins[0]) for ins in self.items]) lengths = np.array([len(ins[0]) for ins in self.items])
@ -419,6 +419,7 @@ class TTSDataset(Dataset):
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
else: else:
d_vectors = None d_vectors = None
# get numerical speaker ids from speaker names # get numerical speaker ids from speaker names
if self.speaker_id_mapping: if self.speaker_id_mapping:
speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]] speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]]

View File

@ -410,11 +410,6 @@ class TacotronLoss(torch.nn.Module):
return_dict["postnet_ssim_loss"] = postnet_ssim_loss return_dict["postnet_ssim_loss"] = postnet_ssim_loss
return_dict["loss"] = loss return_dict["loss"] = loss
# check if any loss is NaN
for key, loss in return_dict.items():
if torch.isnan(loss):
raise RuntimeError(f" [!] NaN loss with {key}.")
return return_dict return return_dict

View File

@ -126,27 +126,24 @@ class GravesAttention(nn.Module):
class OriginalAttention(nn.Module): class OriginalAttention(nn.Module):
"""Bahdanau Attention with various optional modifications. Proposed below. """Bahdanau Attention with various optional modifications.
- Location sensitive attnetion: https://arxiv.org/abs/1712.05884 - Location sensitive attnetion: https://arxiv.org/abs/1712.05884
- Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference - Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
- Using sigmoid instead of softmax normalization - Using sigmoid instead of softmax normalization
- Attention windowing at inference time - Attention windowing at inference time
Note: Note:
Location Sensitive Attention is an attention mechanism that extends the additive attention mechanism Location Sensitive Attention extends the additive attention mechanism
to use cumulative attention weights from previous decoder time steps as an additional feature. to use cumulative attention weights from previous decoder time steps with the current time step features.
Forward attention considers only the alignment paths that satisfy the monotonic condition at each Forward attention computes most probable monotonic alignment. The modified attention probabilities at each
decoder timestep. The modified attention probabilities at each timestep are computed recursively timestep are computed recursively by the forward algorithm.
using a forward algorithm.
Transition agent for forward attention is further proposed, which helps the attention mechanism Transition agent in the forward attention explicitly gates the attention mechanism whether to move forward or
to make decisions whether to move forward or stay at each decoder timestep. stay at each decoder timestep.
Attention windowing applies a sliding windows to time steps of the input tensor centering at the last
time step with the largest attention weight. It is especially useful at inference to keep the attention
alignment diagonal.
Attention windowing is an inductive prior that prevents the model from attending to previous and future timesteps
beyond a certain window.
Args: Args:
query_dim (int): number of channels in the query tensor. query_dim (int): number of channels in the query tensor.

View File

@ -2,7 +2,7 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols
from TTS.utils.generic_utils import find_module from TTS.utils.generic_utils import find_module
def setup_model(config): def setup_model(config, speaker_manager: "SpeakerManager" = None):
print(" > Using model: {}".format(config.model)) print(" > Using model: {}".format(config.model))
# fetch the right model implementation. # fetch the right model implementation.
if "base_model" in config and config["base_model"] is not None: if "base_model" in config and config["base_model"] is not None:
@ -31,7 +31,7 @@ def setup_model(config):
config.model_params.num_chars = num_chars config.model_params.num_chars = num_chars
if "model_args" in config: if "model_args" in config:
config.model_args.num_chars = num_chars config.model_args.num_chars = num_chars
model = MyModel(config) model = MyModel(config, speaker_manager=speaker_manager)
return model return model

View File

@ -11,6 +11,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec from TTS.utils.io import load_fsspec
@ -99,9 +100,10 @@ class AlignTTS(BaseTTS):
# pylint: disable=dangerous-default-value # pylint: disable=dangerous-default-value
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.speaker_manager = speaker_manager
self.config = config self.config = config
self.phase = -1 self.phase = -1
self.length_scale = ( self.length_scale = (

View File

@ -1,6 +1,6 @@
import copy import copy
from abc import abstractmethod from abc import abstractmethod
from typing import Dict, List from typing import Dict
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
@ -9,15 +9,15 @@ from torch import nn
from TTS.tts.layers.losses import TacotronLoss from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
from TTS.utils.generic_utils import format_aux_input from TTS.utils.generic_utils import format_aux_input
from TTS.utils.io import load_fsspec from TTS.utils.io import load_fsspec
from TTS.utils.training import gradual_training_scheduler from TTS.utils.training import gradual_training_scheduler
class BaseTacotron(BaseTTS): class BaseTacotron(BaseTTS):
"""Base class shared by Tacotron and Tacotron2"""
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit):
"""Abstract Tacotron class"""
super().__init__(config) super().__init__(config)
# pass all config fields as class attributes # pass all config fields as class attributes
@ -45,6 +45,7 @@ class BaseTacotron(BaseTTS):
@staticmethod @staticmethod
def _format_aux_input(aux_input: Dict) -> Dict: def _format_aux_input(aux_input: Dict) -> Dict:
"""Set missing fields to their default values"""
if aux_input: if aux_input:
return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)
return None return None
@ -53,14 +54,12 @@ class BaseTacotron(BaseTTS):
# INIT FUNCTIONS # INIT FUNCTIONS
############################# #############################
def _init_states(self):
self.embedded_speakers = None
self.embedded_speakers_projected = None
def _init_backward_decoder(self): def _init_backward_decoder(self):
"""Init the backward decoder for Forward-Backward decoding."""
self.decoder_backward = copy.deepcopy(self.decoder) self.decoder_backward = copy.deepcopy(self.decoder)
def _init_coarse_decoder(self): def _init_coarse_decoder(self):
"""Init the coarse decoder for Double-Decoder Consistency."""
self.coarse_decoder = copy.deepcopy(self.decoder) self.coarse_decoder = copy.deepcopy(self.decoder)
self.coarse_decoder.r_init = self.ddc_r self.coarse_decoder.r_init = self.ddc_r
self.coarse_decoder.set_r(self.ddc_r) self.coarse_decoder.set_r(self.ddc_r)
@ -80,6 +79,13 @@ class BaseTacotron(BaseTTS):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals.
Args:
config (Coqpit): model configuration.
checkpoint_path (str): path to checkpoint file.
eval (bool): whether to load model for evaluation.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config # TODO: set r in run-time by taking it from the new config
@ -98,45 +104,9 @@ class BaseTacotron(BaseTTS):
assert not self.training assert not self.training
def get_criterion(self) -> nn.Module: def get_criterion(self) -> nn.Module:
"""Get the model criterion used in training."""
return TacotronLoss(self.config) return TacotronLoss(self.config)
@staticmethod
def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager:
return get_speaker_manager(config, restore_path, data, out_path)
def get_aux_input(self, **kwargs) -> Dict:
"""Compute Tacotron's auxiliary inputs based on model config.
- speaker d_vector
- style wav for GST
- speaker ID for speaker embedding
"""
# setup speaker_id
if self.config.use_speaker_embedding:
speaker_id = kwargs.get("speaker_id", 0)
else:
speaker_id = None
# setup d_vector
d_vector = (
self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0])
if self.config.use_d_vector_file and self.config.use_speaker_embedding
else None
)
# setup style_mel
if "style_wav" in kwargs:
style_wav = kwargs["style_wav"]
elif self.config.has("gst_style_input"):
style_wav = self.config.gst_style_input
else:
style_wav = None
if style_wav is None and "use_gst" in self.config and self.config.use_gst:
# initialize GST with a zero dict.
style_wav = {}
print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!")
for i in range(self.config.gst["gst_num_style_tokens"]):
style_wav[str(i)] = 0
aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
return aux_inputs
############################# #############################
# COMMON COMPUTE FUNCTIONS # COMMON COMPUTE FUNCTIONS
############################# #############################
@ -182,15 +152,6 @@ class BaseTacotron(BaseTTS):
# EMBEDDING FUNCTIONS # EMBEDDING FUNCTIONS
############################# #############################
def compute_speaker_embedding(self, speaker_ids):
"""Compute speaker embedding vectors"""
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1)
def compute_gst(self, inputs, style_input, speaker_embedding=None): def compute_gst(self, inputs, style_input, speaker_embedding=None):
"""Compute global style token""" """Compute global style token"""
if isinstance(style_input, dict): if isinstance(style_input, dict):

View File

@ -1,4 +1,5 @@
import os import os
import random
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
import torch import torch
@ -9,20 +10,20 @@ from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler from torch.utils.data.distributed import DistributedSampler
from TTS.model import BaseModel from TTS.model import BaseModel
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text import make_symbols from TTS.tts.utils.text import make_symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
# pylint: skip-file # pylint: skip-file
class BaseTTS(BaseModel): class BaseTTS(BaseModel):
"""Abstract `tts` class. Every new `tts` model must inherit this. """Base `tts` class. Every new `tts` model must inherit this.
It defines `tts` specific functions on top of `Model`. It defines common `tts` specific functions on top of `Model` implementation.
Notes on input/output tensor shapes: Notes on input/output tensor shapes:
Any input or output tensor of the model must be shaped as Any input or output tensor of the model must be shaped as
@ -64,7 +65,7 @@ class BaseTTS(BaseModel):
else: else:
from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols
config.characters = parse_symbols() config.characters = CharactersConfig(**parse_symbols())
model_characters = phonemes if config.use_phonemes else symbols model_characters = phonemes if config.use_phonemes else symbols
num_chars = len(model_characters) + getattr(config, "add_blank", False) num_chars = len(model_characters) + getattr(config, "add_blank", False)
return model_characters, config, num_chars return model_characters, config, num_chars
@ -72,35 +73,18 @@ class BaseTTS(BaseModel):
def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager:
return get_speaker_manager(config, restore_path, data, out_path) return get_speaker_manager(config, restore_path, data, out_path)
def init_multispeaker(self, config: Coqpit, data: List = None): def init_multispeaker(self, config: Coqpit):
"""Initialize a speaker embedding layer if needen and define expected embedding channel size for defining """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
`in_channels` size of the connected layers. vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension.
This implementation yields 3 possible outcomes:
1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing.
2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512.
3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
`config.d_vector_dim` or 512.
You can override this function for new models.
Args: Args:
config (Coqpit): Model configuration. config (Coqpit): Model configuration.
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
""" """
# init speaker manager # set number of speakers
self.speaker_manager = get_speaker_manager(config, data=data) if self.speaker_manager is not None:
# set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
if data is not None or self.speaker_manager.speaker_ids:
self.num_speakers = self.speaker_manager.num_speakers self.num_speakers = self.speaker_manager.num_speakers
else: elif hasattr(config, "num_speakers"):
self.num_speakers = ( self.num_speakers = config.num_speakers
config.num_speakers
if "num_speakers" in config and config.num_speakers != 0
else self.speaker_manager.num_speakers
)
# set ultimate speaker embedding size # set ultimate speaker embedding size
if config.use_speaker_embedding or config.use_d_vector_file: if config.use_speaker_embedding or config.use_d_vector_file:
@ -109,13 +93,10 @@ class BaseTTS(BaseModel):
) )
# init speaker embedding layer # init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file: if config.use_speaker_embedding and not config.use_d_vector_file:
print(" > Init speaker_embedding layer.")
self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3) self.speaker_embedding.weight.data.normal_(0, 0.3)
def get_aux_input(self, **kwargs) -> Dict:
"""Prepare and return `aux_input` used by `forward()`"""
return {"speaker_id": None, "style_wav": None, "d_vector": None}
def format_batch(self, batch: Dict) -> Dict: def format_batch(self, batch: Dict) -> Dict:
"""Generic batch formatting for `TTSDataset`. """Generic batch formatting for `TTSDataset`.
@ -206,13 +187,9 @@ class BaseTTS(BaseModel):
ap = assets["audio_processor"] ap = assets["audio_processor"]
# setup multi-speaker attributes # setup multi-speaker attributes
if hasattr(self, "speaker_manager"): if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None
d_vector_mapping = ( d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None
self.speaker_manager.d_vectors
if config.use_speaker_embedding and config.use_d_vector_file
else None
)
else: else:
speaker_id_mapping = None speaker_id_mapping = None
d_vector_mapping = None d_vector_mapping = None
@ -245,9 +222,7 @@ class BaseTTS(BaseModel):
use_noise_augment=not is_eval, use_noise_augment=not is_eval,
verbose=verbose, verbose=verbose,
speaker_id_mapping=speaker_id_mapping, speaker_id_mapping=speaker_id_mapping,
d_vector_mapping=d_vector_mapping d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
if config.use_speaker_embedding and config.use_d_vector_file
else None,
) )
# pre-compute phonemes # pre-compute phonemes
@ -306,6 +281,24 @@ class BaseTTS(BaseModel):
) )
return loader return loader
def _get_test_aux_input(
self,
) -> Dict:
d_vector = None
if self.config.use_d_vector_file:
d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors]
d_vector = (random.sample(sorted(d_vector), 1),)
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}
return aux_inputs
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`. """Generic test run for `tts` models used by `Trainer`.
@ -322,7 +315,7 @@ class BaseTTS(BaseModel):
test_audios = {} test_audios = {}
test_figures = {} test_figures = {}
test_sentences = self.config.test_sentences test_sentences = self.config.test_sentences
aux_inputs = self.get_aux_input() aux_inputs = self._get_test_aux_input()
for idx, sen in enumerate(test_sentences): for idx, sen in enumerate(test_sentences):
outputs_dict = synthesis( outputs_dict = synthesis(
self, self,
@ -345,3 +338,17 @@ class BaseTTS(BaseModel):
outputs_dict["outputs"]["alignments"], output_fig=False outputs_dict["outputs"]["alignments"], output_fig=False
) )
return test_figures, test_audios return test_figures, test_audios
def on_init_start(self, trainer):
"""Save the speaker.json at the beginning of the training. And update the config.json with the
speakers.json file path."""
if self.speaker_manager is not None:
output_path = os.path.join(trainer.output_path, "speakers.json")
self.speaker_manager.save_speaker_ids_to_file(output_path)
trainer.config.speakers_file = output_path
# some models don't have `model_args` set
if hasattr(trainer.config, "model_args"):
trainer.config.model_args.speakers_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `speakers.json` is saved to {output_path}.")
print(" > `speakers_file` is updated in the config.json.")

View File

@ -13,6 +13,7 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
@ -31,9 +32,6 @@ class ForwardTTSArgs(Coqpit):
hidden_channels (int): hidden_channels (int):
Number of base hidden channels of the model. Defaults to 512. Number of base hidden channels of the model. Defaults to 512.
num_speakers (int):
Number of speakers for the speaker embedding layer. Defaults to 0.
use_aligner (bool): use_aligner (bool):
Whether to use aligner network to learn the text to speech alignment or use pre-computed durations. Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
@ -86,12 +84,6 @@ class ForwardTTSArgs(Coqpit):
decoder_params (str): decoder_params (str):
Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}``` Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```
use_d_vector (bool):
Whether to use precomputed d-vectors for multi-speaker training. Defaults to False.
d_vector_dim (int):
Number of channels of the d-vectors. Defaults to 0.
detach_duration_predictor (bool): detach_duration_predictor (bool):
Detach the input to the duration predictor from the earlier computation graph so that the duration loss Detach the input to the duration predictor from the earlier computation graph so that the duration loss
does not pass to the earlier layers. Defaults to True. does not pass to the earlier layers. Defaults to True.
@ -99,12 +91,26 @@ class ForwardTTSArgs(Coqpit):
max_duration (int): max_duration (int):
Maximum duration accepted by the model. Defaults to 75. Maximum duration accepted by the model. Defaults to 75.
num_speakers (int):
Number of speakers for the speaker embedding layer. Defaults to 0.
speakers_file (str):
Path to the speaker mapping file for the Speaker Manager. Defaults to None.
speaker_embedding_channels (int):
Number of speaker embedding channels. Defaults to 256.
use_d_vector_file (bool):
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
d_vector_dim (int):
Number of d-vector channels. Defaults to 0.
""" """
num_chars: int = None num_chars: int = None
out_channels: int = 80 out_channels: int = 80
hidden_channels: int = 384 hidden_channels: int = 384
num_speakers: int = 0
use_aligner: bool = True use_aligner: bool = True
use_pitch: bool = True use_pitch: bool = True
pitch_predictor_hidden_channels: int = 256 pitch_predictor_hidden_channels: int = 256
@ -125,10 +131,14 @@ class ForwardTTSArgs(Coqpit):
decoder_params: dict = field( decoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1} default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}
) )
use_d_vector: bool = False
d_vector_dim: int = 0
detach_duration_predictor: bool = False detach_duration_predictor: bool = False
max_duration: int = 75 max_duration: int = 75
num_speakers: int = 1
use_speaker_embedding: bool = False
speakers_file: str = None
use_d_vector_file: bool = False
d_vector_dim: int = None
d_vector_file: str = None
class ForwardTTS(BaseTTS): class ForwardTTS(BaseTTS):
@ -150,6 +160,8 @@ class ForwardTTS(BaseTTS):
Args: Args:
config (Coqpit): Model coqpit class. config (Coqpit): Model coqpit class.
speaker_manager (SpeakerManager): Speaker manager for multi-speaker training. Only used for multi-speaker models.
Defaults to None.
Examples: Examples:
>>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs >>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
@ -158,10 +170,13 @@ class ForwardTTS(BaseTTS):
""" """
# pylint: disable=dangerous-default-value # pylint: disable=dangerous-default-value
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.speaker_manager = speaker_manager
self.init_multispeaker(config)
self.max_duration = self.args.max_duration self.max_duration = self.args.max_duration
self.use_aligner = self.args.use_aligner self.use_aligner = self.args.use_aligner
self.use_pitch = self.args.use_pitch self.use_pitch = self.args.use_pitch
@ -178,7 +193,7 @@ class ForwardTTS(BaseTTS):
self.args.hidden_channels, self.args.hidden_channels,
self.args.encoder_type, self.args.encoder_type,
self.args.encoder_params, self.args.encoder_params,
self.args.d_vector_dim, self.embedded_speaker_dim,
) )
if self.args.positional_encoding: if self.args.positional_encoding:
@ -192,7 +207,7 @@ class ForwardTTS(BaseTTS):
) )
self.duration_predictor = DurationPredictor( self.duration_predictor = DurationPredictor(
self.args.hidden_channels + self.args.d_vector_dim, self.args.hidden_channels + self.embedded_speaker_dim,
self.args.duration_predictor_hidden_channels, self.args.duration_predictor_hidden_channels,
self.args.duration_predictor_kernel_size, self.args.duration_predictor_kernel_size,
self.args.duration_predictor_dropout_p, self.args.duration_predictor_dropout_p,
@ -200,7 +215,7 @@ class ForwardTTS(BaseTTS):
if self.args.use_pitch: if self.args.use_pitch:
self.pitch_predictor = DurationPredictor( self.pitch_predictor = DurationPredictor(
self.args.hidden_channels + self.args.d_vector_dim, self.args.hidden_channels + self.embedded_speaker_dim,
self.args.pitch_predictor_hidden_channels, self.args.pitch_predictor_hidden_channels,
self.args.pitch_predictor_kernel_size, self.args.pitch_predictor_kernel_size,
self.args.pitch_predictor_dropout_p, self.args.pitch_predictor_dropout_p,
@ -212,19 +227,37 @@ class ForwardTTS(BaseTTS):
padding=int((self.args.pitch_embedding_kernel_size - 1) / 2), padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
) )
if self.args.num_speakers > 1 and not self.args.use_d_vector:
# speaker embedding layer
self.emb_g = nn.Embedding(self.args.num_speakers, self.args.d_vector_dim)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
if self.args.d_vector_dim > 0 and self.args.d_vector_dim != self.args.hidden_channels:
self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1)
if self.args.use_aligner: if self.args.use_aligner:
self.aligner = AlignmentNetwork( self.aligner = AlignmentNetwork(
in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
) )
def init_multispeaker(self, config: Coqpit):
"""Init for multi-speaker training.
Args:
config (Coqpit): Model configuration.
"""
self.embedded_speaker_dim = 0
# init speaker manager
if self.speaker_manager is None and (config.use_d_vector_file or config.use_speaker_embedding):
raise ValueError(
" > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model."
)
# set number of speakers
if self.speaker_manager is not None:
self.num_speakers = self.speaker_manager.num_speakers
# init d-vector embedding
if config.use_d_vector_file:
self.embedded_speaker_dim = config.d_vector_dim
if self.args.d_vector_dim != self.args.hidden_channels:
self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1)
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
print(" > Init speaker_embedding layer.")
self.emb_g = nn.Embedding(self.args.num_speakers, self.args.hidden_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
@staticmethod @staticmethod
def generate_attn(dr, x_mask, y_mask=None): def generate_attn(dr, x_mask, y_mask=None):
"""Generate an attention mask from the durations. """Generate an attention mask from the durations.
@ -289,18 +322,6 @@ class ForwardTTS(BaseTTS):
o_dr = torch.round(o_dr) o_dr = torch.round(o_dr)
return o_dr return o_dr
@staticmethod
def _concat_speaker_embedding(o_en, g):
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
o_en = torch.cat([o_en, g_exp], 1)
return o_en
def _sum_speaker_embedding(self, x, g):
# project g to decoder dim.
if hasattr(self, "proj_g"):
g = self.proj_g(g)
return x + g
def _forward_encoder( def _forward_encoder(
self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None
) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
@ -309,7 +330,7 @@ class ForwardTTS(BaseTTS):
1. Embed speaker IDs if multi-speaker mode. 1. Embed speaker IDs if multi-speaker mode.
2. Embed character sequences. 2. Embed character sequences.
3. Run the encoder network. 3. Run the encoder network.
4. Concat speaker embedding to the encoder output for the duration predictor. 4. Sum encoder outputs and speaker embeddings
Args: Args:
x (torch.LongTensor): Input sequence IDs. x (torch.LongTensor): Input sequence IDs.
@ -327,19 +348,18 @@ class ForwardTTS(BaseTTS):
- g: :math:`(B, C)` - g: :math:`(B, C)`
""" """
if hasattr(self, "emb_g"): if hasattr(self, "emb_g"):
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1] g = self.emb_g(g) # [B, C, 1]
if g is not None: if g is not None:
g = g.unsqueeze(-1) g = g.unsqueeze(-1)
# [B, T, C] # [B, T, C]
x_emb = self.emb(x) x_emb = self.emb(x)
# encoder pass # encoder pass
o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask)
# speaker conditioning for duration predictor # speaker conditioning
# TODO: try different ways of conditioning
if g is not None: if g is not None:
o_en_dp = self._concat_speaker_embedding(o_en, g) o_en = o_en + g
else: return o_en, x_mask, g, x_emb
o_en_dp = o_en
return o_en, o_en_dp, x_mask, g, x_emb
def _forward_decoder( def _forward_decoder(
self, self,
@ -373,9 +393,6 @@ class ForwardTTS(BaseTTS):
# positional encoding # positional encoding
if hasattr(self, "pos_encoder"): if hasattr(self, "pos_encoder"):
o_en_ex = self.pos_encoder(o_en_ex, y_mask) o_en_ex = self.pos_encoder(o_en_ex, y_mask)
# speaker embedding
if g is not None:
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
# decoder pass # decoder pass
o_de = self.decoder(o_en_ex, y_mask, g=g) o_de = self.decoder(o_en_ex, y_mask, g=g)
return o_de.transpose(1, 2), attn.transpose(1, 2) return o_de.transpose(1, 2), attn.transpose(1, 2)
@ -457,6 +474,19 @@ class ForwardTTS(BaseTTS):
alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) alignment_soft = alignment_soft.squeeze(1).transpose(1, 2)
return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas
def _set_speaker_input(self, aux_input: Dict):
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
def forward( def forward(
self, self,
x: torch.LongTensor, x: torch.LongTensor,
@ -487,17 +517,17 @@ class ForwardTTS(BaseTTS):
- g: :math:`[B, C]` - g: :math:`[B, C]`
- pitch: :math:`[B, 1, T]` - pitch: :math:`[B, 1, T]`
""" """
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None g = self._set_speaker_input(aux_input)
# compute sequence masks # compute sequence masks
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float() y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float() x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
# encoder pass # encoder pass
o_en, o_en_dp, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g) o_en, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
# duration predictor pass # duration predictor pass
if self.args.detach_duration_predictor: if self.args.detach_duration_predictor:
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_dr_log = self.duration_predictor(o_en.detach(), x_mask)
else: else:
o_dr_log = self.duration_predictor(o_en_dp, x_mask) o_dr_log = self.duration_predictor(o_en, x_mask)
o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration) o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
# generate attn mask from predicted durations # generate attn mask from predicted durations
o_attn = self.generate_attn(o_dr.squeeze(1), x_mask) o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
@ -517,10 +547,12 @@ class ForwardTTS(BaseTTS):
o_pitch = None o_pitch = None
avg_pitch = None avg_pitch = None
if self.args.use_pitch: if self.args.use_pitch:
o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en_dp, x_mask, pitch, dr) o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en, x_mask, pitch, dr)
o_en = o_en + o_pitch_emb o_en = o_en + o_pitch_emb
# decoder pass # decoder pass
o_de, attn = self._forward_decoder(o_en, dr, x_mask, y_lengths, g=g) o_de, attn = self._forward_decoder(
o_en, dr, x_mask, y_lengths, g=None
) # TODO: maybe pass speaker embedding (g) too
outputs = { outputs = {
"model_outputs": o_de, # [B, T, C] "model_outputs": o_de, # [B, T, C]
"durations_log": o_dr_log.squeeze(1), # [B, T] "durations_log": o_dr_log.squeeze(1), # [B, T]
@ -551,22 +583,22 @@ class ForwardTTS(BaseTTS):
- x_lengths: [B] - x_lengths: [B]
- g: [B, C] - g: [B, C]
""" """
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None g = self._set_speaker_input(aux_input)
x_lengths = torch.tensor(x.shape[1:2]).to(x.device) x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype).float() x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype).float()
# encoder pass # encoder pass
o_en, o_en_dp, x_mask, g, _ = self._forward_encoder(x, x_mask, g) o_en, x_mask, g, _ = self._forward_encoder(x, x_mask, g)
# duration predictor pass # duration predictor pass
o_dr_log = self.duration_predictor(o_en_dp, x_mask) o_dr_log = self.duration_predictor(o_en, x_mask)
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
y_lengths = o_dr.sum(1) y_lengths = o_dr.sum(1)
# pitch predictor pass # pitch predictor pass
o_pitch = None o_pitch = None
if self.args.use_pitch: if self.args.use_pitch:
o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en_dp, x_mask) o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en, x_mask)
o_en = o_en + o_pitch_emb o_en = o_en + o_pitch_emb
# decoder pass # decoder pass
o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=g) o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=None)
outputs = { outputs = {
"model_outputs": o_de, "model_outputs": o_de,
"alignments": attn, "alignments": attn,

View File

@ -1,17 +1,18 @@
import math import math
from typing import Dict, Tuple from typing import Dict, Tuple, Union
import torch import torch
from coqpit import Coqpit
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F from torch.nn import functional as F
from TTS.tts.configs import GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec from TTS.utils.io import load_fsspec
@ -38,17 +39,19 @@ class GlowTTS(BaseTTS):
Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments. Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.
Examples: Examples:
>>> from TTS.tts.configs import GlowTTSConfig >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
>>> from TTS.tts.models.glow_tts import GlowTTS >>> from TTS.tts.models.glow_tts import GlowTTS
>>> config = GlowTTSConfig() >>> config = GlowTTSConfig()
>>> model = GlowTTS(config) >>> model = GlowTTS(config)
""" """
def __init__(self, config: GlowTTSConfig): def __init__(self, config: GlowTTSConfig, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.speaker_manager = speaker_manager
# pass all config fields to `self` # pass all config fields to `self`
# for fewer code change # for fewer code change
self.config = config self.config = config
@ -58,19 +61,10 @@ class GlowTTS(BaseTTS):
_, self.config, self.num_chars = self.get_characters(config) _, self.config, self.num_chars = self.get_characters(config)
self.decoder_output_dim = config.out_channels self.decoder_output_dim = config.out_channels
# init multi-speaker layers if necessary
self.init_multispeaker(config) self.init_multispeaker(config)
# if is a multispeaker and c_in_channels is 0, set to 256
self.c_in_channels = 0
if self.num_speakers > 1:
if self.d_vector_dim:
self.c_in_channels = self.d_vector_dim
elif self.c_in_channels == 0 and not self.d_vector_dim:
# TODO: make this adjustable
self.c_in_channels = 256
self.run_data_dep_init = config.data_dep_init_steps > 0 self.run_data_dep_init = config.data_dep_init_steps > 0
self.encoder = Encoder( self.encoder = Encoder(
self.num_chars, self.num_chars,
out_channels=self.out_channels, out_channels=self.out_channels,
@ -98,28 +92,35 @@ class GlowTTS(BaseTTS):
c_in_channels=self.c_in_channels, c_in_channels=self.c_in_channels,
) )
def init_multispeaker(self, config: "Coqpit", data: list = None) -> None: def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
or with external `d_vectors` computed from a speaker encoder model. vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension.
If you need a different behaviour, override this function for your model.
Args: Args:
config (Coqpit): Model configuration. config (Coqpit): Model configuration.
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
""" """
self.embedded_speaker_dim = 0
# init speaker manager # init speaker manager
self.speaker_manager = get_speaker_manager(config, data=data) if self.speaker_manager is None and (self.use_speaker_embedding or self.use_d_vector_file):
raise ValueError(
" > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model."
)
# set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
if self.speaker_manager is not None:
self.num_speakers = self.speaker_manager.num_speakers self.num_speakers = self.speaker_manager.num_speakers
if config.use_d_vector_file: # set ultimate speaker embedding size
self.external_d_vector_dim = config.d_vector_dim if config.use_speaker_embedding or config.use_d_vector_file:
else: self.embedded_speaker_dim = (
self.external_d_vector_dim = 0 config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
)
# init speaker embedding layer # init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file: if config.use_speaker_embedding and not config.use_d_vector_file:
self.embedded_speaker_dim = self.c_in_channels print(" > Init speaker_embedding layer.")
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.embedded_speaker_dim = self.hidden_channels_enc
self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
# set conditioning dimensions
self.c_in_channels = self.embedded_speaker_dim
@staticmethod @staticmethod
def compute_outputs(attn, o_mean, o_log_scale, x_mask): def compute_outputs(attn, o_mean, o_log_scale, x_mask):
@ -146,6 +147,35 @@ class GlowTTS(BaseTTS):
if getattr(f, "set_ddi", False): if getattr(f, "set_ddi", False):
f.set_ddi(False) f.set_ddi(False)
def _set_speaker_input(self, aux_input: Dict):
if aux_input is None:
d_vectors = None
speaker_ids = None
else:
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]:
g = self._set_speaker_input(aux_input)
# speaker embedding
if g is not None:
if hasattr(self, "emb_g"):
# use speaker embedding layer
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
else:
# use d-vector
g = F.normalize(g).unsqueeze(-1) # [b, h, 1]
return g
def forward( def forward(
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value ): # pylint: disable=dangerous-default-value
@ -161,12 +191,7 @@ class GlowTTS(BaseTTS):
y = y.transpose(1, 2) y = y.transpose(1, 2)
y_max_length = y.size(2) y_max_length = y.size(2)
# norm speaker embeddings # norm speaker embeddings
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None g = self._speaker_embedding(aux_input)
if self.use_speaker_embedding or self.use_d_vector_file:
if not self.use_d_vector_file:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
# embedding pass # embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# drop residual frames wrt num_squeeze and set y_lengths. # drop residual frames wrt num_squeeze and set y_lengths.
@ -217,12 +242,7 @@ class GlowTTS(BaseTTS):
y = y.transpose(1, 2) y = y.transpose(1, 2)
y_max_length = y.size(2) y_max_length = y.size(2)
# norm speaker embeddings # norm speaker embeddings
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None g = self._speaker_embedding(aux_input)
if self.use_speaker_embedding or self.use_d_vector_file:
if not self.use_d_vector_file:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
# embedding pass # embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# drop residual frames wrt num_squeeze and set y_lengths. # drop residual frames wrt num_squeeze and set y_lengths.
@ -272,22 +292,12 @@ class GlowTTS(BaseTTS):
""" """
y = y.transpose(1, 2) y = y.transpose(1, 2)
y_max_length = y.size(2) y_max_length = y.size(2)
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None g = self._speaker_embedding(aux_input)
# norm speaker embeddings
if g is not None:
if self.external_d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype) y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
# decoder pass # decoder pass
z, logdet = self.decoder(y, y_mask, g=g, reverse=False) z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
# reverse decoder and predict # reverse decoder and predict
y, logdet = self.decoder(z, y_mask, g=g, reverse=True) y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
outputs = {} outputs = {}
outputs["model_outputs"] = y.transpose(1, 2) outputs["model_outputs"] = y.transpose(1, 2)
outputs["logdet"] = logdet outputs["logdet"] = logdet
@ -298,14 +308,7 @@ class GlowTTS(BaseTTS):
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value ): # pylint: disable=dangerous-default-value
x_lengths = aux_input["x_lengths"] x_lengths = aux_input["x_lengths"]
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None g = self._speaker_embedding(aux_input)
if g is not None:
if self.d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h]
# embedding pass # embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# compute output durations # compute output durations
@ -389,15 +392,15 @@ class GlowTTS(BaseTTS):
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
alignments = outputs["alignments"] alignments = outputs["alignments"]
text_input = batch["text_input"] text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
text_lengths = batch["text_lengths"] text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"] mel_input = batch["mel_input"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None
# model runs reverse flow to predict spectrograms # model runs reverse flow to predict spectrograms
pred_outputs = self.inference( pred_outputs = self.inference(
text_input[:1], text_input,
aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids}, aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
) )
model_outputs = pred_outputs["model_outputs"] model_outputs = pred_outputs["model_outputs"]
@ -448,7 +451,7 @@ class GlowTTS(BaseTTS):
test_audios = {} test_audios = {}
test_figures = {} test_figures = {}
test_sentences = self.config.test_sentences test_sentences = self.config.test_sentences
aux_inputs = self.get_aux_input() aux_inputs = self._get_test_aux_input()
if len(test_sentences) == 0: if len(test_sentences) == 0:
print(" | [!] No test sentences provided.") print(" | [!] No test sentences provided.")
else: else:

View File

@ -3,11 +3,13 @@
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
@ -15,11 +17,17 @@ class Tacotron(BaseTacotron):
"""Tacotron as in https://arxiv.org/abs/1703.10135 """Tacotron as in https://arxiv.org/abs/1703.10135
It's an autoregressive encoder-attention-decoder-postnet architecture. It's an autoregressive encoder-attention-decoder-postnet architecture.
Check `TacotronConfig` for the arguments. Check `TacotronConfig` for the arguments.
Args:
config (TacotronConfig): Configuration for the Tacotron model.
speaker_manager (SpeakerManager): Speaker manager to handle multi-speaker settings. Only use if the model is
a multi-speaker model. Defaults to None.
""" """
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.speaker_manager = speaker_manager
chars, self.config, _ = self.get_characters(config) chars, self.config, _ = self.get_characters(config)
config.num_chars = self.num_chars = len(chars) config.num_chars = self.num_chars = len(chars)
@ -240,19 +248,20 @@ class Tacotron(BaseTacotron):
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
outputs["model_outputs"], outputs["model_outputs"].float(),
outputs["decoder_outputs"], outputs["decoder_outputs"].float(),
mel_input, mel_input.float(),
linear_input, linear_input.float(),
outputs["stop_tokens"], outputs["stop_tokens"].float(),
stop_targets, stop_targets.float(),
stop_target_lengths, stop_target_lengths,
mel_lengths, mel_lengths,
outputs["decoder_outputs_backward"], None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"], outputs["alignments"].float(),
alignment_lengths, alignment_lengths,
outputs["alignments_backward"], None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
text_lengths, text_lengths,
) )
@ -263,17 +272,23 @@ class Tacotron(BaseTacotron):
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]
decoder_outputs = outputs["decoder_outputs"]
alignments = outputs["alignments"] alignments = outputs["alignments"]
alignments_backward = outputs["alignments_backward"] alignments_backward = outputs["alignments_backward"]
mel_input = batch["mel_input"] mel_input = batch["mel_input"]
linear_input = batch["linear_input"]
pred_spec = postnet_outputs[0].data.cpu().numpy() pred_linear_spec = postnet_outputs[0].data.cpu().numpy()
gt_spec = mel_input[0].data.cpu().numpy() pred_mel_spec = decoder_outputs[0].data.cpu().numpy()
gt_linear_spec = linear_input[0].data.cpu().numpy()
gt_mel_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy()
figures = { figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False), "pred_linear_spec": plot_spectrogram(pred_linear_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), "real_linear_spec": plot_spectrogram(gt_linear_spec, ap, output_fig=False),
"pred_mel_spec": plot_spectrogram(pred_mel_spec, ap, output_fig=False),
"real_mel_spec": plot_spectrogram(gt_mel_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False), "alignment": plot_alignment(align_img, output_fig=False),
} }
@ -281,7 +296,7 @@ class Tacotron(BaseTacotron):
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
# Sample audio # Sample audio
audio = ap.inv_spectrogram(pred_spec.T) audio = ap.inv_spectrogram(pred_linear_spec.T)
return figures, {"audio": audio} return figures, {"audio": audio}
def train_log( def train_log(

View File

@ -1,24 +1,49 @@
# coding: utf-8 # coding: utf-8
from typing import Dict
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
class Tacotron2(BaseTacotron): class Tacotron2(BaseTacotron):
"""Tacotron2 as in https://arxiv.org/abs/1712.05884 """Tacotron2 model implementation inherited from :class:`TTS.tts.models.base_tacotron.BaseTacotron`.
Check `TacotronConfig` for the arguments.
Paper::
https://arxiv.org/abs/1712.05884
Paper abstract::
This paper describes Tacotron 2, a neural network architecture for speech synthesis directly from text.
The system is composed of a recurrent sequence-to-sequence feature prediction network that maps character
embeddings to mel-scale spectrograms, followed by a modified WaveNet model acting as a vocoder to synthesize
time-domain waveforms from those spectrograms. Our model achieves a mean opinion score (MOS) of 4.53 comparable
to a MOS of 4.58 for professionally recorded speech. To validate our design choices, we present ablation
studies of key components of our system and evaluate the impact of using mel spectrograms as the input to
WaveNet instead of linguistic, duration, and F0 features. We further demonstrate that using a compact acoustic
intermediate representation enables significant simplification of the WaveNet architecture.
Check :class:`TTS.tts.configs.tacotron2_config.Tacotron2Config` for model arguments.
Args:
config (TacotronConfig):
Configuration for the Tacotron2 model.
speaker_manager (SpeakerManager):
Speaker manager for multi-speaker training. Use only for multi-speaker training. Defaults to None.
""" """
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.speaker_manager = speaker_manager
chars, self.config, _ = self.get_characters(config) chars, self.config, _ = self.get_characters(config)
config.num_chars = len(chars) config.num_chars = len(chars)
self.decoder_output_dim = config.out_channels self.decoder_output_dim = config.out_channels
@ -28,9 +53,7 @@ class Tacotron2(BaseTacotron):
for key in config: for key in config:
setattr(self, key, config[key]) setattr(self, key, config[key])
# set speaker embedding channel size for determining `in_channels` for the connected layers. # init multi-speaker layers
# `init_multispeaker` needs to be called once more in training to initialize the speaker embedding layer based
# on the number of speakers infered from the dataset.
if self.use_speaker_embedding or self.use_d_vector_file: if self.use_speaker_embedding or self.use_d_vector_file:
self.init_multispeaker(config) self.init_multispeaker(config)
self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim
@ -100,6 +123,7 @@ class Tacotron2(BaseTacotron):
@staticmethod @staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments): def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
"""Final reshape of the model output tensors."""
mel_outputs = mel_outputs.transpose(1, 2) mel_outputs = mel_outputs.transpose(1, 2)
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments return mel_outputs, mel_outputs_postnet, alignments
@ -107,13 +131,14 @@ class Tacotron2(BaseTacotron):
def forward( # pylint: disable=dangerous-default-value def forward( # pylint: disable=dangerous-default-value
self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None} self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
): ):
""" """Forward pass for training with Teacher Forcing.
Shapes: Shapes:
text: [B, T_in] text: :math:`[B, T_in]`
text_lengths: [B] text_lengths: :math:`[B]`
mel_specs: [B, T_out, C] mel_specs: :math:`[B, T_out, C]`
mel_lengths: [B] mel_lengths: :math:`[B]`
aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] aux_input: 'speaker_ids': :math:`[B, 1]` and 'd_vectors': :math:`[B, C]`
""" """
aux_input = self._format_aux_input(aux_input) aux_input = self._format_aux_input(aux_input)
outputs = {"alignments_backward": None, "decoder_outputs_backward": None} outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
@ -174,6 +199,12 @@ class Tacotron2(BaseTacotron):
@torch.no_grad() @torch.no_grad()
def inference(self, text, aux_input=None): def inference(self, text, aux_input=None):
"""Forward pass for inference with no Teacher-Forcing.
Shapes:
text: :math:`[B, T_in]`
text_lengths: :math:`[B]`
"""
aux_input = self._format_aux_input(aux_input) aux_input = self._format_aux_input(aux_input)
embedded_inputs = self.embedding(text).transpose(1, 2) embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs) encoder_outputs = self.encoder.inference(embedded_inputs)
@ -207,18 +238,17 @@ class Tacotron2(BaseTacotron):
} }
return outputs return outputs
def train_step(self, batch, criterion): def train_step(self, batch: Dict, criterion: torch.nn.Module):
"""Perform a single training step by fetching the right set if samples from the batch. """A single training step. Forward pass and loss computation.
Args: Args:
batch ([type]): [description] batch ([Dict]): A dictionary of input tensors.
criterion ([type]): [description] criterion ([type]): Callable criterion to compute model loss.
""" """
text_input = batch["text_input"] text_input = batch["text_input"]
text_lengths = batch["text_lengths"] text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"] mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"] mel_lengths = batch["mel_lengths"]
linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"] stop_targets = batch["stop_targets"]
stop_target_lengths = batch["stop_target_lengths"] stop_target_lengths = batch["stop_target_lengths"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
@ -245,19 +275,20 @@ class Tacotron2(BaseTacotron):
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
outputs["model_outputs"], outputs["model_outputs"].float(),
outputs["decoder_outputs"], outputs["decoder_outputs"].float(),
mel_input, mel_input.float(),
linear_input, None,
outputs["stop_tokens"], outputs["stop_tokens"].float(),
stop_targets, stop_targets.float(),
stop_target_lengths, stop_target_lengths,
mel_lengths, mel_lengths,
outputs["decoder_outputs_backward"], None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"], outputs["alignments"].float(),
alignment_lengths, alignment_lengths,
outputs["alignments_backward"], None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
text_lengths, text_lengths,
) )
@ -267,6 +298,7 @@ class Tacotron2(BaseTacotron):
return outputs, loss_dict return outputs, loss_dict
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
"""Create dashboard log information."""
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]
alignments = outputs["alignments"] alignments = outputs["alignments"]
alignments_backward = outputs["alignments_backward"] alignments_backward = outputs["alignments_backward"]
@ -292,6 +324,7 @@ class Tacotron2(BaseTacotron):
def train_log( def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use ) -> None: # pylint: disable=no-self-use
"""Log training progress."""
ap = assets["audio_processor"] ap = assets["audio_processor"]
figures, audios = self._create_logs(batch, outputs, ap) figures, audios = self._create_logs(batch, outputs, ap)
logger.train_figures(steps, figures) logger.train_figures(steps, figures)

View File

@ -1,4 +1,5 @@
import math import math
import random
from dataclasses import dataclass, field from dataclasses import dataclass, field
from itertools import chain from itertools import chain
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
@ -14,7 +15,7 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment from TTS.tts.utils.visual import plot_alignment
from TTS.utils.trainer_utils import get_optimizer, get_scheduler from TTS.utils.trainer_utils import get_optimizer, get_scheduler
@ -180,6 +181,7 @@ class VitsArgs(Coqpit):
speakers_file: str = None speakers_file: str = None
speaker_embedding_channels: int = 256 speaker_embedding_channels: int = 256
use_d_vector_file: bool = False use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = 0 d_vector_dim: int = 0
detach_dp_input: bool = True detach_dp_input: bool = True
@ -214,12 +216,13 @@ class Vits(BaseTTS):
# pylint: disable=dangerous-default-value # pylint: disable=dangerous-default-value
def __init__(self, config: Coqpit): def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
super().__init__(config) super().__init__(config)
self.END2END = True self.END2END = True
self.speaker_manager = speaker_manager
if config.__class__.__name__ == "VitsConfig": if config.__class__.__name__ == "VitsConfig":
# loading from VitsConfig # loading from VitsConfig
if "num_chars" not in config: if "num_chars" not in config:
@ -311,30 +314,41 @@ class Vits(BaseTTS):
if args.init_discriminator: if args.init_discriminator:
self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator) self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator)
def init_multispeaker(self, config: Coqpit, data: List = None): def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model. or with external `d_vectors` computed from a speaker encoder model.
If you need a different behaviour, override this function for your model.
Args: Args:
config (Coqpit): Model configuration. config (Coqpit): Model configuration.
data (List, optional): Dataset items to infer number of speakers. Defaults to None. data (List, optional): Dataset items to infer number of speakers. Defaults to None.
""" """
self.embedded_speaker_dim = 0
if hasattr(config, "model_args"): if hasattr(config, "model_args"):
config = config.model_args config = config.model_args
self.embedded_speaker_dim = 0
# init speaker manager self.num_speakers = config.num_speakers
self.speaker_manager = get_speaker_manager(config, data=data)
if config.num_speakers > 0 and self.speaker_manager.num_speakers == 0: if config.use_speaker_embedding:
self.speaker_manager.num_speakers = config.num_speakers self._init_speaker_embedding(config)
self.num_speakers = self.speaker_manager.num_speakers
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
self.embedded_speaker_dim = config.speaker_embedding_channels
self.emb_g = nn.Embedding(config.num_speakers, config.speaker_embedding_channels)
# init d-vector usage
if config.use_d_vector_file: if config.use_d_vector_file:
self._init_d_vector(config)
def _init_speaker_embedding(self, config):
# pylint: disable=attribute-defined-outside-init
if config.speakers_file is not None:
self.speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file_path)
if self.num_speakers > 0:
print(" > initialization of speaker-embedding layers.")
self.embedded_speaker_dim = config.speaker_embedding_channels
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
def _init_d_vector(self, config):
# pylint: disable=attribute-defined-outside-init
if hasattr(self, "emb_g"):
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
self.embedded_speaker_dim = config.d_vector_dim self.embedded_speaker_dim = config.d_vector_dim
@staticmethod @staticmethod
@ -349,6 +363,10 @@ class Vits(BaseTTS):
g = aux_input["d_vectors"] g = aux_input["d_vectors"]
return sid, g return sid, g
def get_aux_input(self, aux_input: Dict):
sid, g = self._set_cond_input(aux_input)
return {"speaker_id": sid, "style_wav": None, "d_vector": g}
def forward( def forward(
self, self,
x: torch.tensor, x: torch.tensor,
@ -633,7 +651,15 @@ class Vits(BaseTTS):
test_audios = {} test_audios = {}
test_figures = {} test_figures = {}
test_sentences = self.config.test_sentences test_sentences = self.config.test_sentences
# pick one random speaker id / d-vector for the test sentences, depending on the multi-speaker mode
aux_inputs = {
    "speaker_id": None
    if not self.config.use_speaker_embedding
    else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1),
    # NOTE: the stdlib function is `random.sample`; `random.samples` does not exist
    "d_vector": None
    if not self.config.use_d_vector_file
    else random.sample(sorted(self.speaker_manager.d_vectors.values()), 1),
    "style_wav": None,
}
for idx, sen in enumerate(test_sentences): for idx, sen in enumerate(test_sentences):
wav, alignment, _, _ = synthesis( wav, alignment, _, _ = synthesis(
self, self,
@ -670,7 +696,7 @@ class Vits(BaseTTS):
) )
# add the speaker embedding layer # add the speaker embedding layer
if hasattr(self, "emb_g"): if hasattr(self, "emb_g"):
gen_parameters = chain(gen_parameters, self.emb_g) gen_parameters = chain(gen_parameters, self.emb_g.parameters())
optimizer0 = get_optimizer( optimizer0 = get_optimizer(
self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters
) )

View File

@ -63,7 +63,6 @@ class SpeakerManager:
use_cuda: bool = False, use_cuda: bool = False,
): ):
self.data_items = []
self.d_vectors = {} self.d_vectors = {}
self.speaker_ids = {} self.speaker_ids = {}
self.clip_ids = [] self.clip_ids = []
@ -72,7 +71,7 @@ class SpeakerManager:
self.use_cuda = use_cuda self.use_cuda = use_cuda
if data_items: if data_items:
self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items) self.speaker_ids, _ = self.parse_speakers_from_data(data_items)
if d_vectors_file_path: if d_vectors_file_path:
self.set_d_vectors_from_file(d_vectors_file_path) self.set_d_vectors_from_file(d_vectors_file_path)

View File

@ -23,8 +23,10 @@ def _ssim(img1, img2, window, window_size, channel, size_average=True):
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
mu1_sq = mu1.pow(2) # TODO: check if you need AMP disabled
mu2_sq = mu2.pow(2) # with torch.cuda.amp.autocast(enabled=False):
mu1_sq = mu1.float().pow(2)
mu2_sq = mu2.float().pow(2)
mu1_mu2 = mu1 * mu2 mu1_mu2 = mu1 * mu2
sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq

View File

@ -172,7 +172,7 @@ def speaker_id_to_torch(speaker_id, cuda=False):
def embedding_to_torch(d_vector, cuda=False): def embedding_to_torch(d_vector, cuda=False):
if d_vector is not None: if d_vector is not None:
d_vector = np.asarray(d_vector) d_vector = np.asarray(d_vector)
d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
if cuda: if cuda:
return d_vector.cuda() return d_vector.cuda()
return d_vector return d_vector
@ -210,20 +210,42 @@ def synthesis(
d_vector=None, d_vector=None,
backend="torch", backend="torch",
): ):
"""Synthesize voice for the given text. """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args: Args:
model (TTS.tts.models): model to synthesize. model (TTS.tts.models):
text (str): target text The TTS model to synthesize audio with.
CONFIG (dict): config dictionary to be loaded from config.json.
use_cuda (bool): enable cuda. text (str):
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process The input text to convert to speech.
model outputs.
speaker_id (int): id of speaker CONFIG (Coqpit):
style_wav (str | Dict[str, float]): Uses for style embedding of GST. Model configuration.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
do_trim_silence (bool): trim silence after synthesis. use_cuda (bool):
backend (str): tf or torch Enable/disable CUDA.
ap (TTS.tts.utils.audio.AudioProcessor):
The audio processor for extracting features and pre/post-processing audio.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
style_wav (str | Dict[str, float]):
Path or tensor to/of a waveform used for computing the style embedding. Defaults to None.
enable_eos_bos_chars (bool):
enable special chars for end of sentence and start of sentence. Defaults to False.
do_trim_silence (bool):
trim silence after synthesis. Defaults to False.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
backend (str):
tf or torch. Defaults to "torch".
""" """
# GST processing # GST processing
style_mel = None style_mel = None

View File

@ -108,6 +108,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
class AudioProcessor(object): class AudioProcessor(object):
"""Audio Processor for TTS used by all the data pipelines. """Audio Processor for TTS used by all the data pipelines.
TODO: Make this a dataclass to replace `BaseAudioConfig`.
Note: Note:
All the class arguments are set to default values to enable a flexible initialization All the class arguments are set to default values to enable a flexible initialization
of the class with the model config. They are not meaningful for all the arguments. of the class with the model config. They are not meaningful for all the arguments.
@ -643,6 +645,10 @@ class AudioProcessor(object):
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
# align F0 length to the spectrogram length
if len(x) % self.hop_length == 0:
x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
f0, t = pw.dio( f0, t = pw.dio(
x.astype(np.double), x.astype(np.double),
fs=self.sample_rate, fs=self.sample_rate,
@ -745,6 +751,14 @@ class AudioProcessor(object):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
def get_duration(self, filename: str) -> float:
    """Get the duration of a wav file using Librosa.

    Args:
        filename (str): Path to the wav file.

    Returns:
        float: Duration of the audio file as reported by `librosa.get_duration`.
    """
    # The path has to be passed by keyword: the first positional parameter of
    # `librosa.get_duration` is the audio time series `y`, not a file path.
    return librosa.get_duration(filename=filename)
@staticmethod @staticmethod
def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
mu = 2 ** qc - 1 mu = 2 ** qc - 1

View File

@ -47,9 +47,17 @@ class ConsoleLogger:
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC
) )
for key, value in loss_dict.items(): for key, value in loss_dict.items():
# print the avg value if given
if f"avg_{key}" in avg_loss_dict.keys(): if f"avg_{key}" in avg_loss_dict.keys():
# print the avg value if given
if isinstance(value, float) and round(value, 5) == 0:
# do not round the number if it is zero when rounded
log_text += "{}{}: {} ({})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
else:
# print the rounded value
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"]) log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
else:
if isinstance(value, float) and round(value, 5) == 0:
log_text += "{}{}: {} \n".format(indent, key, value)
else: else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value) log_text += "{}{}: {:.5f} \n".format(indent, key, value)
print(log_text, flush=True) print(log_text, flush=True)

View File

@ -87,52 +87,15 @@ class Synthesizer(object):
""" """
return pysbd.Segmenter(language=lang, clean=True) return pysbd.Segmenter(language=lang, clean=True)
def _load_speakers(self, speaker_file: str) -> None:
"""Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker
encoder if it is defined.
Args:
speaker_file (str): path to the speakers meta-data file.
"""
print("Loading speakers ...")
self.speaker_manager = SpeakerManager(
encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
)
self.speaker_manager.load_d_vectors_file(self.tts_config.get("d_vector_file", speaker_file))
self.num_speakers = self.speaker_manager.num_speakers
self.d_vector_dim = self.speaker_manager.d_vector_dim
def _set_tts_speaker_file(self):
"""Set the TTS speaker file used by a multi-speaker model."""
# setup if multi-speaker settings are in the global model config
if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True:
if self.tts_config.use_d_vector_file:
self.tts_speakers_file = (
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]
)
self.tts_config["d_vector_file"] = self.tts_speakers_file
else:
self.tts_speakers_file = (
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["speakers_file"]
)
# setup if multi-speaker settings are in the model args config
if (
self.tts_speakers_file is None
and hasattr(self.tts_config, "model_args")
and hasattr(self.tts_config.model_args, "use_speaker_embedding")
and self.tts_config.model_args.use_speaker_embedding
):
_args = self.tts_config.model_args
if _args.use_d_vector_file:
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["d_vector_file"]
_args["d_vector_file"] = self.tts_speakers_file
else:
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["speakers_file"]
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
"""Load the TTS model. """Load the TTS model.
1. Load the model config.
2. Init the AudioProcessor.
3. Init the model from the config.
4. Move the model to the GPU if CUDA is enabled.
5. Init the speaker manager for the model.
Args: Args:
tts_checkpoint (str): path to the model checkpoint. tts_checkpoint (str): path to the model checkpoint.
tts_config_path (str): path to the model config file. tts_config_path (str): path to the model config file.
@ -144,15 +107,38 @@ class Synthesizer(object):
self.use_phonemes = self.tts_config.use_phonemes self.use_phonemes = self.tts_config.use_phonemes
self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
self.tts_model = setup_tts_model(config=self.tts_config) speaker_manager = self._init_speaker_manager()
self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager)
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
if use_cuda: if use_cuda:
self.tts_model.cuda() self.tts_model.cuda()
self._set_tts_speaker_file()
def _init_speaker_manager(self):
    """Initialize the SpeakerManager from the multi-speaker settings of the model config.

    Two mutually independent setups are checked in order:

    1. ``use_speaker_embedding``: the manager is built from a speaker-ID file.
    2. ``use_d_vector_file``: the manager is built from a d-vector file.

    In both setups the file given to the synthesizer (``self.tts_speakers_file``) is tried
    first and the path stored in the config (``speakers_file`` / ``d_vector_file``) is tried
    second, so the config path wins when both are set. If both setups are enabled, the
    d-vector manager is the one returned because it is assigned last.

    Returns:
        SpeakerManager | None: The initialized manager, or None when the config enables
            neither setup or no speaker file path is available.
    """
    # setup if multi-speaker settings are in the global model config
    speaker_manager = None
    if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True:
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file)
        if self.tts_config.get("speakers_file", None):
            speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_config.speakers_file)

    # Gate the d-vector setup on its own flag. Gating it on `use_speaker_embedding` would
    # load a speaker-ID file as a d-vector file for every speaker-embedding model.
    if hasattr(self.tts_config, "use_d_vector_file") and self.tts_config.use_d_vector_file is True:
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file)
        if self.tts_config.get("d_vector_file", None):
            speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file)
    return speaker_manager
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
"""Load the vocoder model. """Load the vocoder model.
1. Load the vocoder config.
2. Init the AudioProcessor for the vocoder.
3. Init the vocoder model from the config.
4. Move the model to the GPU if CUDA is enabled.
Args: Args:
model_file (str): path to the model checkpoint. model_file (str): path to the model checkpoint.
model_config (str): path to the model config file. model_config (str): path to the model config file.
@ -207,11 +193,12 @@ class Synthesizer(object):
# handle multi-speaker # handle multi-speaker
speaker_embedding = None speaker_embedding = None
speaker_id = None speaker_id = None
if self.tts_speakers_file: if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
if speaker_idx and isinstance(speaker_idx, str): if speaker_idx and isinstance(speaker_idx, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors. # get the speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else: else:
# get speaker idx from the speaker name # get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx] speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx]
@ -226,7 +213,7 @@ class Synthesizer(object):
else: else:
if speaker_idx: if speaker_idx:
raise ValueError( raise ValueError(
f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}." f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}."
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
) )

View File

@ -47,6 +47,7 @@
models/glow_tts.md models/glow_tts.md
models/vits.md models/vits.md
models/forward_tts.md models/forward_tts.md
models/tacotron1-2.md
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2

View File

@ -0,0 +1,63 @@
# 🌮 Tacotron 1 and 2
Tacotron is one of the first successful DL-based text-to-mel models and opened up the whole TTS field for more DL research.
Tacotron mainly is an encoder-decoder model with attention.
The encoder takes input tokens (characters or phonemes) and the decoder outputs mel-spectrogram* frames. The attention module in between learns to align the input tokens with the output mel-spectrograms.
Tacotron1 and 2 are both built on the same encoder-decoder architecture but they use different layers. Additionally, Tacotron1 uses a Postnet module to convert mel-spectrograms to linear spectrograms with a higher resolution before the vocoder.
Vanilla Tacotron models are slow at inference due to the auto-regressive* nature that prevents the model from processing all the inputs in parallel. One trick is to use a higher “reduction rate” that helps the model to predict multiple frames at once. That is, reduction rate 2 reduces the number of decoder iterations by half.
Tacotron also uses a Prenet module with Dropout that projects the model's previous output before feeding it to the decoder again. The paper and most of the implementations use the Dropout layer even in inference and they report the attention fails or the voice quality degrades otherwise. The downside is that you get a slightly different output speech every time you run the model.
Training the attention is notoriously problematic in Tacotron models. Especially in inference, for some input sequences, the alignment fails and causes the model to produce unexpected results. There are many different methods proposed to improve the attention.
After hundreds of experiments, @ 🐸TTS we suggest Double Decoder Consistency that leads to the most robust model performance.
If you have a limited VRAM, then you can try using the Guided Attention Loss or the Dynamic Convolutional Attention. You can also combine the two.
## Important resources & papers
- Tacotron: https://arxiv.org/abs/1703.10135
- Tacotron2: https://arxiv.org/abs/1712.05884
- Double Decoder Consistency: https://coqui.ai/blog/tts/solving-attention-problems-of-tts-models-with-double-decoder-consistency
- Guided Attention Loss: https://arxiv.org/abs/1710.08969
- Forward & Backward Decoder: https://arxiv.org/abs/1907.09006
- Forward Attention: https://arxiv.org/abs/1807.06736
- Gaussian Attention: https://arxiv.org/abs/1910.10288
- Dynamic Convolutional Attention: https://arxiv.org/pdf/1910.10288.pdf
## BaseTacotron
```{eval-rst}
.. autoclass:: TTS.tts.models.base_tacotron.BaseTacotron
:members:
```
## Tacotron
```{eval-rst}
.. autoclass:: TTS.tts.models.tacotron.Tacotron
:members:
```
## Tacotron2
```{eval-rst}
.. autoclass:: TTS.tts.models.tacotron2.Tacotron2
:members:
```
## TacotronConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.tacotron_config.TacotronConfig
:members:
```
## Tacotron2Config
```{eval-rst}
.. autoclass:: TTS.tts.configs.tacotron2_config.Tacotron2Config
:members:
```

View File

@ -1,18 +1,19 @@
# Training a Model # Training a Model
1. Decide what model you want to use. 1. Decide the model you want to use.
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
community models and see how fast and good each of the models. Or you can start a discussion on our communication channels. community models and see how fast and good each of the models. Or you can start a discussion on our communication channels.
2. Understand the configuration, its fields and values of your model. 2. Understand the configuration, its fields and values.
For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it. For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it.
3. Go to the recipes and check the recipe of your target model. 3. Check the recipes.
Recipes do not promise perfect models but they provide a good start point for `Nervous Beginners`. A recipe script for Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point for
`GlowTTS` using `LJSpeech` dataset looks like below. Let's be creative and call this `train_glowtts.py`. `Nervous Beginners`.
A recipe for `GlowTTS` using `LJSpeech` dataset looks like below. Let's be creative and call this `train_glowtts.py`.
```python ```python
# train_glowtts.py # train_glowtts.py
@ -20,7 +21,8 @@
import os import os
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig from TTS.tts.configs.shared_config import BaseDatasetConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
@ -183,3 +185,80 @@
8. Return to the step 1 and reiterate for training a `vocoder` model. 8. Return to the step 1 and reiterate for training a `vocoder` model.
In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models.
# Multi-speaker Training
Training a multi-speaker model is mostly the same as training a single-speaker model.
You need to specify a couple of configuration parameters, initiate a `SpeakerManager` instance and pass it to the model.
The configuration parameters define whether you want to train the model with a speaker-embedding layer or pre-computed
d-vectors. For using d-vectors, you first need to compute the d-vectors using the `SpeakerEncoder`.
The same Glow-TTS model above can be trained on a multi-speaker VCTK dataset with the script below.
```python
import os

from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor

# define dataset config for VCTK
# the dataset is expected next to this script, under `../VCTK/`
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

# init audio processing config
audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=23.0)

# init training config
# `audio=audio_config` makes the model and the AudioProcessor below use the audio settings
# defined above instead of the config defaults.
config = GlowTTSConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    audio=audio_config,
    datasets=[dataset_config],
    use_speaker_embedding=True,
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# ONLY FOR MULTI-SPEAKER: init speaker manager for multi-speaker training
# the speaker IDs are collected from both splits so that every speaker gets an ID
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()
```

View File

@ -29,10 +29,10 @@ each line.
import os import os
# GlowTTSConfig: all model related values for training, validating and testing. # GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs import GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset. # BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs import BaseDatasetConfig from TTS.tts.configs.shared_config import BaseDatasetConfig
# init_training: Initialize and setup the training environment. # init_training: Initialize and setup the training environment.
# Trainer: Where the ✨️ happens. # Trainer: Where the ✨️ happens.
@ -79,7 +79,7 @@ each line.
# Initiate the Trainer. # Initiate the Trainer.
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training etc. # distributed training, etc.
trainer = Trainer( trainer = Trainer(
TrainingArgs(), TrainingArgs(),
config, config,

View File

@ -1,342 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6LWsNd3_M3MP"
},
"source": [
"# Mozilla TTS on CPU Real-Time Speech Synthesis "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FAqrSIWgLyP0"
},
"source": [
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
"\n",
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
"\n",
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
"\n",
"Note that both model performances can be improved with more training."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ku-dA4DKoeXk"
},
"source": [
"### Download Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"colab_type": "code",
"id": "jGIgnWhGsxU1",
"outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"colab_type": "code",
"id": "4dnpE0-kvTsu",
"outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zlgi8fPdpRF0"
},
"source": [
"### Define TTS function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "f-Yc42nQZG5A"
},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" waveform = waveform.flatten()\n",
" if use_cuda:\n",
" waveform = waveform.cpu()\n",
" waveform = waveform.numpy()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZksegYQepkFg"
},
"source": [
"### Load Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "oVa0kOamprgj"
},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.utils.io import load_config\n",
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.synthesis import synthesis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EY-sHVO8IFSH"
},
"outputs": [],
"source": [
"# runtime settings\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_1aIUp2FpxOQ"
},
"outputs": [],
"source": [
"# model paths\n",
"TTS_MODEL = \"data/tts_model.pth.tar\"\n",
"TTS_CONFIG = \"data/config.json\"\n",
"VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
"VOCODER_CONFIG = \"data/config_vocoder.json\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CpgmdBVQplbv"
},
"outputs": [],
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"colab_type": "code",
"id": "zmrQxiozIUVE",
"outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
"tags": []
},
"outputs": [],
"source": [
"# load the audio processor\n",
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"colab_type": "code",
"id": "8fLoI4ipqMeS",
"outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
"tags": []
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker \n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
"\n",
"# load model state\n",
"cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
"\n",
"# load the model\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()\n",
"\n",
"# set model stepsize\n",
"if 'r' in cp:\n",
" model.decoder.set_r(cp['r'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "zKoq0GgzqzhQ",
"outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
"tags": []
},
"outputs": [],
"source": [
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"# LOAD VOCODER MODEL\n",
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
"vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
"vocoder_model.remove_weight_norm()\n",
"vocoder_model.inference_padding = 0\n",
"\n",
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
"if use_cuda:\n",
" vocoder_model.cuda()\n",
"vocoder_model.eval()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ws_YkPKsLgo-"
},
"source": [
"## Run Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "FuWxZ9Ey5Puj",
"outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
"tags": []
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,346 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "6LWsNd3_M3MP"
},
"source": [
"# Mozilla TTS on CPU Real-Time Speech Synthesis with Tensorflow"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "FAqrSIWgLyP0"
},
"source": [
"**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n",
"\n",
"These TF models support TF 2.2 and for different versions you might need to\n",
"regenerate them. \n",
"\n",
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
"\n",
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
"\n",
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
"\n",
"Note that both model performances can be improved with more training.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "Ku-dA4DKoeXk"
},
"source": [
"### Download Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"colab_type": "code",
"id": "jGIgnWhGsxU1",
"outputId": "08b0dddd-4edf-48c9-e8e5-a419b36a5c3d",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"colab_type": "code",
"id": "4dnpE0-kvTsu",
"outputId": "2fe836eb-c7e7-4f1e-9352-0142126bb19f",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl\n",
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "Zlgi8fPdpRF0"
},
"source": [
"### Define TTS function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {},
"colab_type": "code",
"id": "f-Yc42nQZG5A"
},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, p):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
" backend='tf')\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" waveform = waveform.numpy()[0, 0]\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "ZksegYQepkFg"
},
"source": [
"### Load Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {},
"colab_type": "code",
"id": "oVa0kOamprgj"
},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from TTS.tts.tf.utils.generic_utils import setup_model\n",
"from TTS.tts.tf.utils.io import load_checkpoint\n",
"from TTS.utils.io import load_config\n",
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.synthesis import synthesis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {},
"colab_type": "code",
"id": "EY-sHVO8IFSH"
},
"outputs": [],
"source": [
"# runtime settings\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {},
"colab_type": "code",
"id": "_1aIUp2FpxOQ"
},
"outputs": [],
"source": [
"# model paths\n",
"TTS_MODEL = \"data/tts_model.pkl\"\n",
"TTS_CONFIG = \"data/config.json\"\n",
"VOCODER_MODEL = \"data/vocoder_model.pkl\"\n",
"VOCODER_CONFIG = \"data/config_vocoder.json\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {},
"colab_type": "code",
"id": "CpgmdBVQplbv"
},
"outputs": [],
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"colab_type": "code",
"id": "zmrQxiozIUVE",
"outputId": "fa71bd05-401f-4e5b-a6f7-60ae765966db",
"tags": []
},
"outputs": [],
"source": [
"# load the audio processor\n",
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
},
"colab_type": "code",
"id": "8fLoI4ipqMeS",
"outputId": "595d990f-930d-4698-ee14-77796b5eed7d",
"tags": []
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker \n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
"model.build_inference()\n",
"model = load_checkpoint(model, TTS_MODEL)\n",
"model.decoder.set_max_decoder_steps(1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 489
},
"colab_type": "code",
"id": "zKoq0GgzqzhQ",
"outputId": "2cc3deae-144f-4465-da3b-98628d948506"
},
"outputs": [],
"source": [
"from TTS.vocoder.tf.utils.generic_utils import setup_generator\n",
"from TTS.vocoder.tf.utils.io import load_checkpoint\n",
"\n",
"# LOAD VOCODER MODEL\n",
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
"vocoder_model.build_inference()\n",
"vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)\n",
"vocoder_model.inference_padding = 0\n",
"\n",
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) "
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false",
"colab_type": "text",
"id": "Ws_YkPKsLgo-"
},
"source": [
"## Run Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "FuWxZ9Ey5Puj",
"outputId": "07ede6e5-06e6-4612-f687-7984d20e5254"
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "DDC-TTS_and_MultiBand-MelGAN_TF_Example.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,342 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6LWsNd3_M3MP"
},
"source": [
"# Mozilla TTS on CPU Real-Time Speech Synthesis "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FAqrSIWgLyP0"
},
"source": [
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
"\n",
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
"\n",
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
"\n",
"Note that both model performances can be improved with more training."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ku-dA4DKoeXk"
},
"source": [
"### Download Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"colab_type": "code",
"id": "jGIgnWhGsxU1",
"outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"colab_type": "code",
"id": "4dnpE0-kvTsu",
"outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n",
"!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zlgi8fPdpRF0"
},
"source": [
"### Define TTS function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "f-Yc42nQZG5A"
},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" waveform = waveform.flatten()\n",
" if use_cuda:\n",
" waveform = waveform.cpu()\n",
" waveform = waveform.numpy()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZksegYQepkFg"
},
"source": [
"### Load Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "oVa0kOamprgj"
},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.utils.io import load_config\n",
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.synthesis import synthesis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EY-sHVO8IFSH"
},
"outputs": [],
"source": [
"# runtime settings\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_1aIUp2FpxOQ"
},
"outputs": [],
"source": [
"# model paths\n",
"TTS_MODEL = \"data/tts_model.pth.tar\"\n",
"TTS_CONFIG = \"data/config.json\"\n",
"VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
"VOCODER_CONFIG = \"data/config_vocoder.json\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CpgmdBVQplbv"
},
"outputs": [],
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"colab_type": "code",
"id": "zmrQxiozIUVE",
"outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
"tags": []
},
"outputs": [],
"source": [
"# load the audio processor\n",
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"colab_type": "code",
"id": "8fLoI4ipqMeS",
"outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
"tags": []
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker \n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
"\n",
"# load model state\n",
"cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
"\n",
"# load the model\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()\n",
"\n",
"# set model stepsize\n",
"if 'r' in cp:\n",
" model.decoder.set_r(cp['r'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "zKoq0GgzqzhQ",
"outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
"tags": []
},
"outputs": [],
"source": [
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"# LOAD VOCODER MODEL\n",
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
"vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
"vocoder_model.remove_weight_norm()\n",
"vocoder_model.inference_padding = 0\n",
"\n",
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
"if use_cuda:\n",
" vocoder_model.cuda()\n",
"vocoder_model.eval()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ws_YkPKsLgo-"
},
"source": [
"## Run Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "FuWxZ9Ey5Puj",
"outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
"tags": []
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -2,14 +2,16 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training." "This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training."
], ]
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"%load_ext autoreload\n", "%load_ext autoreload\n",
"%autoreload 2\n", "%autoreload 2\n",
@ -20,7 +22,7 @@
"import numpy as np\n", "import numpy as np\n",
"from tqdm import tqdm as tqdm\n", "from tqdm import tqdm as tqdm\n",
"from torch.utils.data import DataLoader\n", "from torch.utils.data import DataLoader\n",
"from TTS.tts.datasets.TTSDataset import TTSDataset\n", "from TTS.tts.datasets.dataset import TTSDataset\n",
"from TTS.tts.layers.losses import L1LossMasked\n", "from TTS.tts.layers.losses import L1LossMasked\n",
"from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.audio import AudioProcessor\n",
"from TTS.config import load_config\n", "from TTS.config import load_config\n",
@ -33,13 +35,13 @@
"\n", "\n",
"import os\n", "import os\n",
"os.environ['CUDA_VISIBLE_DEVICES']='2'" "os.environ['CUDA_VISIBLE_DEVICES']='2'"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"def set_filename(wav_path, out_path):\n", "def set_filename(wav_path, out_path):\n",
" wav_file = os.path.basename(wav_path)\n", " wav_file = os.path.basename(wav_path)\n",
@ -51,13 +53,13 @@
" mel_path = os.path.join(out_path, \"mel\", file_name)\n", " mel_path = os.path.join(out_path, \"mel\", file_name)\n",
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n", " wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
" return file_name, wavq_path, mel_path, wav_path" " return file_name, wavq_path, mel_path, wav_path"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n", "OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n",
"DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n", "DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n",
@ -77,13 +79,13 @@
"C = load_config(CONFIG_PATH)\n", "C = load_config(CONFIG_PATH)\n",
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n", "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)" "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"print(C['r'])\n", "print(C['r'])\n",
"# if the vocabulary was passed, replace the default\n", "# if the vocabulary was passed, replace the default\n",
@ -95,13 +97,13 @@
"# TODO: multiple speaker\n", "# TODO: multiple speaker\n",
"model = setup_model(C)\n", "model = setup_model(C)\n",
"model.load_checkpoint(C, MODEL_FILE, eval=True)" "model.load_checkpoint(C, MODEL_FILE, eval=True)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n", "preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
"preprocessor = getattr(preprocessor, DATASET.lower())\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n",
@ -120,20 +122,20 @@
"loader = DataLoader(\n", "loader = DataLoader(\n",
" dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n", " dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
")\n" ")\n"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"### Generate model outputs " "### Generate model outputs "
], ]
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import pickle\n", "import pickle\n",
"\n", "\n",
@ -212,42 +214,42 @@
"\n", "\n",
" print(np.mean(losses))\n", " print(np.mean(losses))\n",
" print(np.mean(postnet_losses))" " print(np.mean(postnet_losses))"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# for pwgan\n", "# for pwgan\n",
"with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n", "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
" for data in metadata:\n", " for data in metadata:\n",
" f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")" " f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"### Sanity Check" "### Sanity Check"
], ]
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"idx = 1\n", "idx = 1\n",
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape" "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import soundfile as sf\n", "import soundfile as sf\n",
"wav, sr = sf.read(item_idx[idx])\n", "wav, sr = sf.read(item_idx[idx])\n",
@ -255,46 +257,46 @@
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n", "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
"mel_truth = ap.melspectrogram(wav)\n", "mel_truth = ap.melspectrogram(wav)\n",
"print(mel_truth.shape)" "print(mel_truth.shape)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# plot posnet output\n", "# plot posnet output\n",
"print(mel_postnet[:mel_lengths[idx], :].shape)\n", "print(mel_postnet[:mel_lengths[idx], :].shape)\n",
"plot_spectrogram(mel_postnet, ap)" "plot_spectrogram(mel_postnet, ap)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# plot decoder output\n", "# plot decoder output\n",
"print(mel_decoder.shape)\n", "print(mel_decoder.shape)\n",
"plot_spectrogram(mel_decoder, ap)" "plot_spectrogram(mel_decoder, ap)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# plot GT specgrogram\n", "# plot GT specgrogram\n",
"print(mel_truth.shape)\n", "print(mel_truth.shape)\n",
"plot_spectrogram(mel_truth.T, ap)" "plot_spectrogram(mel_truth.T, ap)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# postnet, decoder diff\n", "# postnet, decoder diff\n",
"from matplotlib import pylab as plt\n", "from matplotlib import pylab as plt\n",
@ -303,13 +305,13 @@
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
"plt.colorbar()\n", "plt.colorbar()\n",
"plt.tight_layout()" "plt.tight_layout()"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# PLOT GT SPECTROGRAM diff\n", "# PLOT GT SPECTROGRAM diff\n",
"from matplotlib import pylab as plt\n", "from matplotlib import pylab as plt\n",
@ -318,13 +320,13 @@
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
"plt.colorbar()\n", "plt.colorbar()\n",
"plt.tight_layout()" "plt.tight_layout()"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# PLOT GT SPECTROGRAM diff\n", "# PLOT GT SPECTROGRAM diff\n",
"from matplotlib import pylab as plt\n", "from matplotlib import pylab as plt\n",
@ -334,22 +336,23 @@
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
"plt.colorbar()\n", "plt.colorbar()\n",
"plt.tight_layout()" "plt.tight_layout()"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"source": [], "metadata": {},
"outputs": [], "outputs": [],
"metadata": {} "source": []
} }
], ],
"metadata": { "metadata": {
"interpreter": {
"hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
},
"kernelspec": { "kernelspec": {
"name": "python3", "display_name": "Python 3.9.7 64-bit ('base': conda)",
"display_name": "Python 3.9.7 64-bit ('base': conda)" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@ -362,9 +365,6 @@
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.7" "version": "3.9.7"
},
"interpreter": {
"hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -19,19 +19,16 @@
"source": [ "source": [
"import os\n", "import os\n",
"import glob\n", "import glob\n",
"import random\n",
"import numpy as np\n", "import numpy as np\n",
"import torch\n",
"import umap\n", "import umap\n",
"\n", "\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.generic_utils import load_config\n", "from TTS.config import load_config\n",
"\n", "\n",
"from bokeh.io import output_notebook, show\n", "from bokeh.io import output_notebook, show\n",
"from bokeh.plotting import figure\n", "from bokeh.plotting import figure\n",
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
"from bokeh.transform import factor_cmap, factor_mark\n", "from bokeh.transform import factor_cmap\n",
"from bokeh.palettes import Category10" "from bokeh.palettes import Category10"
] ]
}, },

View File

@ -22,7 +22,6 @@
"import os\n", "import os\n",
"import sys\n", "import sys\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"import glob\n",
"import librosa\n", "import librosa\n",
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",

View File

@ -21,10 +21,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import os, sys\n", "import os\n",
"import glob\n", "import glob\n",
"import subprocess\n", "import subprocess\n",
"import tempfile\n",
"import IPython\n", "import IPython\n",
"import soundfile as sf\n", "import soundfile as sf\n",
"import numpy as np\n", "import numpy as np\n",

File diff suppressed because one or more lines are too long

View File

@ -1,13 +1,16 @@
# 🐸💬 TTS Training Recipes # 🐸💬 TTS Training Recipes
TTS recipes are intended to host bash scripts running all the necessary steps to train a TTS model with a particular dataset. TTS recipes are intended to host scripts running all the necessary steps to train a TTS model on a particular dataset.
Run each script from the root TTS folder as follows For each dataset, you need to download the dataset once. Then you run the training for the model you want.
Run each script from the root TTS folder as follows.
```console ```console
$ bash ./recipes/<dataset>/<model>/run.sh $ sh ./recipes/<dataset>/download_<dataset>.sh
$ python recipes/<dataset>/<model_name>/train.py
``` ```
All the outputs are held under the recipe directory unless you change the paths in the bash script.
If you train a new model using TTS, feel free to share your training to expand the list of recipes. If you train a new model using TTS, feel free to share your training to expand the list of recipes.
You can also open a new discussion and share your progress with the 🐸 community.

View File

@ -1,7 +1,7 @@
import os import os
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import AlignTTSConfig, BaseDatasetConfig from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS from TTS.tts.models.align_tts import AlignTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -10,5 +10,5 @@ tar -xjf LJSpeech-1.1.tar.bz2
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
mv LJSpeech-1.1 $RUN_DIR/ mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/
rm LJSpeech-1.1.tar.bz2 rm LJSpeech-1.1.tar.bz2

View File

@ -1,8 +1,8 @@
import os import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import FastPitchConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.models.forward_tts import ForwardTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -2,7 +2,7 @@ import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import FastSpeechConfig from TTS.tts.configs.fast_speech_config import FastSpeechConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.models.forward_tts import ForwardTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -1,7 +1,8 @@
import os import os
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -2,7 +2,7 @@ import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.models.forward_tts import ForwardTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Download the VCTK corpus (v0.92) and extract it into a `VCTK/` directory
# located next to this script.

# Stop at the first failing command so that a failed download or extraction
# is not followed by the cleanup step.
set -euo pipefail

# Absolute path of the directory that contains this script; used to prefix
# all the output paths.
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
echo "$RUN_DIR"

# download VCTK dataset
wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip

# extract straight into the final location.
# RUN_DIR already is the script's own directory, so appending `recipes/vctk/`
# to it again would point at a path that does not exist.
# NOTE(review): assumes this script lives in recipes/vctk/, which is where the
# training recipes look for the data (`../VCTK/` relative to each model
# directory) -- confirm.
mkdir -p "$RUN_DIR/VCTK"
unzip VCTK-Corpus-0.92.zip -d "$RUN_DIR/VCTK"

# remove the downloaded archive once it has been extracted
rm VCTK-Corpus-0.92.zip

View File

@ -0,0 +1,80 @@
import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.fast_pitch_config import FastPitchConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory of this recipe script; it is handed to the config and the trainer
# as the output location and also anchors the cache paths below.
output_path = os.path.dirname(os.path.abspath(__file__))

# VCTK is read from `../VCTK/` relative to this recipe directory.
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = FastPitchConfig(
    # Name the run after this recipe (FastPitch on VCTK) rather than after the
    # LJSpeech recipe it was copied from.
    run_name="fast_pitch_vctk",
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    compute_f0=True,
    f0_cache_path=os.path.join(output_path, "f0_cache"),
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
)

# init audio processor
ap = AudioProcessor(**config.audio)

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
# the speaker count is taken from the data (train + eval samples)
config.model_args.num_speakers = speaker_manager.num_speakers

# init model
model = ForwardTTS(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,80 @@
import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.fast_speech_config import FastSpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory of this recipe script; it is handed to the config and the trainer
# as the output location and also anchors the cache paths below.
output_path = os.path.dirname(os.path.abspath(__file__))

# VCTK is read from `../VCTK/` relative to this recipe directory.
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = FastSpeechConfig(
    # Name the run after this recipe (FastSpeech on VCTK); the previous value
    # was copied from the FastPitch LJSpeech recipe.
    run_name="fast_speech_vctk",
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    compute_f0=True,
    f0_cache_path=os.path.join(output_path, "f0_cache"),
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
)

# init audio processor
ap = AudioProcessor(**config.audio)

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
# the speaker count is taken from the data (train + eval samples)
config.model_args.num_speakers = speaker_manager.num_speakers

# init model
model = ForwardTTS(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,62 @@
import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory of this recipe script; it is handed to the config and the trainer
# as the output location and also anchors the phoneme cache path below.
output_path = os.path.dirname(os.path.abspath(__file__))

# VCTK is read from `../VCTK/` relative to this recipe directory.
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=23.0)

config = GlowTTSConfig(
    # Pass the audio settings defined above; without this the object was
    # created but never used, so its values had no effect on training.
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
# the speaker count is taken from the data (train + eval samples)
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,80 @@
import os
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory containing this script; used as the training output path and as
# the base for the dataset and cache paths below.
output_path = os.path.dirname(os.path.abspath(__file__))

# VCTK dataset definition; the data is looked up in `../VCTK/` relative to this script.
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

# Audio settings shared by the model config and the audio processor.
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = SpeedySpeechConfig(
    # Name the run after this recipe (SpeedySpeech on VCTK) rather than the
    # FastPitch/LJSpeech label it was copied from.
    run_name="speedy_speech_vctk",
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    # NOTE(review): F0 computation is enabled here; confirm that SpeedySpeech
    # actually consumes pitch features, otherwise this is wasted preprocessing.
    compute_f0=True,
    f0_cache_path=os.path.join(output_path, "f0_cache"),
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
)

# init audio processor
# `.to_dict()` is used for consistency with the other VCTK recipes.
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
# the speaker count comes from the speakers found in the train + eval samples
config.model_args.num_speakers = speaker_manager.num_speakers

# init model
model = ForwardTTS(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,80 @@
import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron import Tacotron
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory containing this script; it doubles as the training output path.
recipe_dir = os.path.dirname(os.path.abspath(__file__))
vctk_root = os.path.join(recipe_dir, "../VCTK/")
phoneme_cache_dir = os.path.join(recipe_dir, "phoneme_cache")

# VCTK dataset definition.
vctk_dataset = BaseDatasetConfig(name="vctk", meta_file_train="", path=vctk_root)

# Audio settings shared by the model config and the audio processor.
audio_settings = BaseAudioConfig(
    sample_rate=22050,
    # Resample to 22050 Hz on the fly. It slows down training; use
    # `TTS/bin/resample.py` to pre-resample and set this to False for faster training.
    resample=True,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    preemphasis=0.0,
    ref_level_db=20,
    spec_gain=1.0,
    log_func="np.log",
    mel_fmin=0.0,
    mel_fmax=8000,
)

# This is the config that is saved for future use.
tacotron_config = TacotronConfig(
    # --- data and audio ---
    audio=audio_settings,
    datasets=[vctk_dataset],
    output_path=recipe_dir,
    # --- text processing ---
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=phoneme_cache_dir,
    # --- batching and data loading ---
    batch_size=48,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    sort_by_audio_len=True,
    min_seq_len=0,
    # 44000 * 10; the original recipe note describes this as 10 seconds of
    # audio at the sampling rate before resampling.
    max_seq_len=44000 * 10,
    # --- model ---
    r=6,
    gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    # --- training schedule and logging ---
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    print_step=25,
    print_eval=False,
)

# Build the audio processor from the audio section of the model config.
audio_processor = AudioProcessor(**tacotron_config.audio.to_dict())

# Load the training and evaluation samples.
train_split, eval_split = load_tts_samples(vctk_dataset, eval_split=True)

# Speaker manager for multi-speaker training; it mainly handles the
# speaker-id to speaker-name mapping for the model and the data-loader.
speakers = SpeakerManager()
all_samples = train_split + eval_split
speakers.set_speaker_ids_from_data(all_samples)

# Build the model.
tacotron = Tacotron(tacotron_config, speakers)

# Build the trainer and start training 🚀
training_assets = {"audio_processor": audio_processor}
training_run = Trainer(
    TrainingArgs(),
    tacotron_config,
    recipe_dir,
    model=tacotron,
    train_samples=train_split,
    eval_samples=eval_split,
    training_assets=training_assets,
)
training_run.fit()

View File

@ -0,0 +1,87 @@
import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory containing this script; it doubles as the training output path.
recipe_dir = os.path.dirname(os.path.abspath(__file__))
vctk_root = os.path.join(recipe_dir, "../VCTK/")
phoneme_cache_dir = os.path.join(recipe_dir, "phoneme_cache")

# VCTK dataset definition.
vctk_dataset = BaseDatasetConfig(name="vctk", meta_file_train="", path=vctk_root)

# Audio settings shared by the model config and the audio processor.
audio_settings = BaseAudioConfig(
    sample_rate=22050,
    # On-the-fly resampling is disabled here because it slows down training;
    # use `TTS/bin/resample.py` to pre-resample the data to 22050 Hz.
    resample=False,
    do_trim_silence=True,
    trim_db=23.0,
    signal_norm=False,
    preemphasis=0.0,
    spec_gain=1.0,
    log_func="np.log",
    mel_fmin=0.0,
    mel_fmax=8000,
)

# This is the config that is saved for future use.
tacotron2_config = Tacotron2Config(
    # --- data and audio ---
    audio=audio_settings,
    datasets=[vctk_dataset],
    output_path=recipe_dir,
    # --- text processing ---
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=phoneme_cache_dir,
    # --- batching and data loading ---
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    sort_by_audio_len=True,
    min_seq_len=14800,
    # 22050 * 10 -- presumably 10 seconds of audio at the 22050 Hz sample rate; verify the unit.
    max_seq_len=22050 * 10,
    # --- model ---
    r=2,
    # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=False,
    attention_norm="softmax",
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    # --- losses ---
    decoder_ssim_alpha=0.0,  # disable ssim losses, which cause NaN for some runs
    postnet_ssim_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    postnet_diff_spec_alpha=0.0,
    # --- optimisation ---
    optimizer="Adam",
    lr=3e-5,
    lr_scheduler=None,
    mixed_precision=True,
    # --- training schedule and logging ---
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=150,
    print_eval=False,
)

# Build the audio processor from the audio section of the model config.
audio_processor = AudioProcessor(**tacotron2_config.audio.to_dict())

# Load the training and evaluation samples.
train_split, eval_split = load_tts_samples(vctk_dataset, eval_split=True)

# Speaker manager for multi-speaker training; it mainly handles the
# speaker-id to speaker-name mapping for the model and the data-loader.
speakers = SpeakerManager()
all_samples = train_split + eval_split
speakers.set_speaker_ids_from_data(all_samples)

# Build the model.
tacotron2 = Tacotron2(tacotron2_config, speakers)

# Build the trainer and start training 🚀
training_assets = {"audio_processor": audio_processor}
training_run = Trainer(
    TrainingArgs(),
    tacotron2_config,
    recipe_dir,
    model=tacotron2,
    train_samples=train_split,
    eval_samples=eval_split,
    training_assets=training_assets,
)
training_run.fit()

View File

@ -0,0 +1,86 @@
import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
# Directory containing this script; it doubles as the training output path.
recipe_dir = os.path.dirname(os.path.abspath(__file__))
vctk_root = os.path.join(recipe_dir, "../VCTK/")
phoneme_cache_dir = os.path.join(recipe_dir, "phoneme_cache")

# VCTK dataset definition.
vctk_dataset = BaseDatasetConfig(name="vctk", meta_file_train="", path=vctk_root)

# Audio settings shared by the model config and the audio processor.
audio_settings = BaseAudioConfig(
    # --- signal ---
    sample_rate=22050,
    resample=True,
    do_trim_silence=True,
    trim_db=23.0,
    preemphasis=0.0,
    # --- spectrogram framing ---
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
    # --- scaling and normalisation ---
    ref_level_db=20,
    spec_gain=1.0,
    log_func="np.log",
    signal_norm=False,
    do_amp_to_db_linear=False,
)

vits_config = VitsConfig(
    run_name="vits_vctk",
    # --- data and audio ---
    audio=audio_settings,
    datasets=[vctk_dataset],
    output_path=recipe_dir,
    # --- text processing ---
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=phoneme_cache_dir,
    compute_input_seq_cache=True,
    # --- batching and data loading ---
    batch_size=32,
    eval_batch_size=16,
    batch_group_size=5,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    sort_by_audio_len=True,
    min_seq_len=32 * 256 * 4,
    max_seq_len=1500000,
    # --- model ---
    use_speaker_embedding=True,
    # --- training schedule and logging ---
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    print_step=25,
    print_eval=False,
)

# Build the audio processor from the audio section of the model config.
audio_processor = AudioProcessor(**vits_config.audio.to_dict())

# Load the training and evaluation samples.
train_split, eval_split = load_tts_samples(vctk_dataset, eval_split=True)

# Speaker manager for multi-speaker training; it maps speaker-id to
# speaker-name in the model and the data-loader.
speakers = SpeakerManager()
all_samples = train_split + eval_split
speakers.set_speaker_ids_from_data(all_samples)
# The speaker count comes from the speakers found in the train + eval samples.
vits_config.model_args.num_speakers = speakers.num_speakers

# Build the model.
vits = Vits(vits_config, speakers)

# Build the trainer and start training 🚀
training_assets = {"audio_processor": audio_processor}
training_run = Trainer(
    TrainingArgs(),
    vits_config,
    recipe_dir,
    model=vits,
    train_samples=train_split,
    eval_samples=eval_split,
    training_assets=training_assets,
)
training_run.fit()

View File

View File

@ -7,7 +7,7 @@ import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests import get_tests_output_path from tests import get_tests_output_path
from TTS.tts.configs import BaseTTSConfig from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.datasets import TTSDataset from TTS.tts.datasets import TTSDataset
from TTS.tts.datasets.formatters import ljspeech from TTS.tts.datasets.formatters import ljspeech
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import AlignTTSConfig from TTS.tts.configs.align_tts_config import AlignTTSConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -4,7 +4,7 @@ import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs import FastPitchConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig
config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json") config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -6,7 +6,7 @@ import torch
from torch import optim from torch import optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs import GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")
@ -23,7 +23,7 @@ config = Tacotron2Config(
epochs=1, epochs=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
use_speaker_embedding=True, use_speaker_embedding=False,
use_d_vector_file=True, use_d_vector_file=True,
test_sentences=[ test_sentences=[
"Be a voice, not an echo.", "Be a voice, not an echo.",

View File

@ -6,8 +6,8 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs import Tacotron2Config
from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.configs.shared_configs import GSTConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.layers.losses import MSELossMasked from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
@ -114,7 +114,7 @@ class MultiSpeakerTacotronTrainTest(unittest.TestCase):
assert (param - param_ref).sum() == 0, param assert (param - param_ref).sum() == 0, param
count += 1 count += 1
optimizer = optim.Adam(model.parameters(), lr=config.lr) optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5): for _ in range(5):
outputs = model.forward( outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
) )

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -5,7 +5,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
import torch import torch
from TTS.tts.configs import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.tf.models.tacotron2 import Tacotron2 from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -6,7 +6,8 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs import GSTConfig, TacotronConfig from TTS.tts.configs.shared_configs import GSTConfig
from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.layers.losses import L1LossMasked from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.models.tacotron import Tacotron from TTS.tts.models.tacotron import Tacotron
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import TacotronConfig from TTS.tts.configs.tacotron_config import TacotronConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")

View File

@ -3,7 +3,7 @@ import os
import shutil import shutil
from tests import get_device_id, get_tests_output_path, run_cli from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import VitsConfig from TTS.tts.configs.vits_config import VitsConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")