mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'trainer_v1' into dev
This commit is contained in:
commit
44d47fd4cf
|
@ -1,4 +1,4 @@
|
||||||
name: CI
|
name: aux-tests
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
|
@ -45,8 +45,5 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install .[all]
|
python3 -m pip install .[all]
|
||||||
python3 setup.py egg_info
|
python3 setup.py egg_info
|
||||||
- name: Lint check
|
|
||||||
run: |
|
|
||||||
make lint
|
|
||||||
- name: Unit tests
|
- name: Unit tests
|
||||||
run: make test
|
run: make test_aux
|
|
@ -0,0 +1,50 @@
|
||||||
|
name: style-check
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
jobs:
|
||||||
|
check_skip:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
||||||
|
steps:
|
||||||
|
- run: echo "${{ github.event.head_commit.message }}"
|
||||||
|
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [3.9]
|
||||||
|
experimental: [false]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- uses: actions/cache@v1
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pip
|
||||||
|
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
architecture: x64
|
||||||
|
- name: check OS
|
||||||
|
run: cat /etc/os-release
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install -y git make
|
||||||
|
sudo apt install -y python3-wheel gcc
|
||||||
|
make system-deps
|
||||||
|
- name: Upgrade pip
|
||||||
|
run: python3 -m pip install --upgrade pip
|
||||||
|
- name: Install TTS
|
||||||
|
run: |
|
||||||
|
python3 -m pip install .[all]
|
||||||
|
python3 setup.py egg_info
|
||||||
|
- name: Lint check
|
||||||
|
run: |
|
||||||
|
make lint
|
|
@ -0,0 +1,49 @@
|
||||||
|
name: tts-tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
jobs:
|
||||||
|
check_skip:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
||||||
|
steps:
|
||||||
|
- run: echo "${{ github.event.head_commit.message }}"
|
||||||
|
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [3.6, 3.7, 3.8, 3.9]
|
||||||
|
experimental: [false]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- uses: actions/cache@v1
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pip
|
||||||
|
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
architecture: x64
|
||||||
|
- name: check OS
|
||||||
|
run: cat /etc/os-release
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install -y git make
|
||||||
|
sudo apt install -y python3-wheel gcc
|
||||||
|
make system-deps
|
||||||
|
- name: Upgrade pip
|
||||||
|
run: python3 -m pip install --upgrade pip
|
||||||
|
- name: Install TTS
|
||||||
|
run: |
|
||||||
|
python3 -m pip install .[all]
|
||||||
|
python3 setup.py egg_info
|
||||||
|
- name: Unit tests
|
||||||
|
run: make test_tts
|
|
@ -0,0 +1,49 @@
|
||||||
|
name: vocoder-tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
jobs:
|
||||||
|
check_skip:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
||||||
|
steps:
|
||||||
|
- run: echo "${{ github.event.head_commit.message }}"
|
||||||
|
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [3.6, 3.7, 3.8, 3.9]
|
||||||
|
experimental: [false]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- uses: actions/cache@v1
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pip
|
||||||
|
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/setup.py') }}
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
architecture: x64
|
||||||
|
- name: check OS
|
||||||
|
run: cat /etc/os-release
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install -y git make
|
||||||
|
sudo apt install -y python3-wheel gcc
|
||||||
|
make system-deps
|
||||||
|
- name: Upgrade pip
|
||||||
|
run: python3 -m pip install --upgrade pip
|
||||||
|
- name: Install TTS
|
||||||
|
run: |
|
||||||
|
python3 -m pip install .[all]
|
||||||
|
python3 setup.py egg_info
|
||||||
|
- name: Unit tests
|
||||||
|
run: make test_vocoder
|
|
@ -124,6 +124,15 @@ version.py
|
||||||
# jupyter dummy files
|
# jupyter dummy files
|
||||||
core
|
core
|
||||||
|
|
||||||
|
# ignore local datasets
|
||||||
|
recipes/WIP/*
|
||||||
|
recipes/ljspeech/LJSpeech-1.1/*
|
||||||
|
recipes/vctk/VCTK/*
|
||||||
|
VCTK-Corpus-removed-silence/*
|
||||||
|
|
||||||
|
# ignore training logs
|
||||||
|
trainer_*_log.txt
|
||||||
|
|
||||||
# files used internally for dev, test etc.
|
# files used internally for dev, test etc.
|
||||||
tests/outputs/*
|
tests/outputs/*
|
||||||
tests/train_outputs/*
|
tests/train_outputs/*
|
||||||
|
@ -134,9 +143,6 @@ notebooks/data/*
|
||||||
TTS/tts/layers/glow_tts/monotonic_align/core.c
|
TTS/tts/layers/glow_tts/monotonic_align/core.c
|
||||||
.vscode-upload.json
|
.vscode-upload.json
|
||||||
temp_build/*
|
temp_build/*
|
||||||
recipes/WIP/*
|
|
||||||
recipes/ljspeech/LJSpeech-1.1/*
|
|
||||||
recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/*
|
|
||||||
events.out*
|
events.out*
|
||||||
old_configs/*
|
old_configs/*
|
||||||
model_importers/*
|
model_importers/*
|
||||||
|
|
9
Makefile
9
Makefile
|
@ -12,6 +12,15 @@ test_all: ## run tests and don't stop on an error.
|
||||||
|
|
||||||
test: ## run tests.
|
test: ## run tests.
|
||||||
nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
|
nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
|
||||||
|
|
||||||
|
test_vocoder: ## run vocoder tests.
|
||||||
|
nosetests tests.vocoder_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.vocoder_tests --nologcapture --with-id
|
||||||
|
|
||||||
|
test_tts: ## run tts tests.
|
||||||
|
nosetests tests.tts_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.tts_tests --nologcapture --with-id
|
||||||
|
|
||||||
|
test_aux: ## run aux tests.
|
||||||
|
nosetests tests.aux_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.aux_tests --nologcapture --with-id
|
||||||
./run_bash_tests.sh
|
./run_bash_tests.sh
|
||||||
|
|
||||||
test_failed: ## only run tests failed the last time.
|
test_failed: ## only run tests failed the last time.
|
||||||
|
|
|
@ -5,7 +5,7 @@ from argparse import RawTextHelpFormatter
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_meta_data
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.utils.speakers import SpeakerManager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
|
@ -36,7 +36,7 @@ args = parser.parse_args()
|
||||||
|
|
||||||
c_dataset = load_config(args.config_dataset_path)
|
c_dataset = load_config(args.config_dataset_path)
|
||||||
|
|
||||||
meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
|
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
|
||||||
wav_files = meta_data_train + meta_data_eval
|
wav_files = meta_data_train + meta_data_eval
|
||||||
|
|
||||||
speaker_manager = SpeakerManager(
|
speaker_manager = SpeakerManager(
|
||||||
|
|
|
@ -10,7 +10,7 @@ from tqdm import tqdm
|
||||||
|
|
||||||
# from TTS.utils.io import load_config
|
# from TTS.utils.io import load_config
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_meta_data
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ def main():
|
||||||
if args.data_path:
|
if args.data_path:
|
||||||
dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
|
dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
|
||||||
else:
|
else:
|
||||||
dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data
|
dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
|
||||||
print(f" > There are {len(dataset_items)} files.")
|
print(f" > There are {len(dataset_items)} files.")
|
||||||
|
|
||||||
mel_sum = 0
|
mel_sum = 0
|
||||||
|
|
|
@ -10,8 +10,7 @@ from torch.utils.data import DataLoader
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_meta_data
|
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
||||||
from TTS.tts.datasets.TTSDataset import TTSDataset
|
|
||||||
from TTS.tts.models import setup_model
|
from TTS.tts.models import setup_model
|
||||||
from TTS.tts.utils.speakers import get_speaker_manager
|
from TTS.tts.utils.speakers import get_speaker_manager
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
@ -230,7 +229,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
ap = AudioProcessor(**c.audio)
|
ap = AudioProcessor(**c.audio)
|
||||||
|
|
||||||
# load data instances
|
# load data instances
|
||||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval)
|
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=args.eval)
|
||||||
|
|
||||||
# use eval and training partitions
|
# use eval and training partitions
|
||||||
meta_data = meta_data_train + meta_data_eval
|
meta_data = meta_data_train + meta_data_eval
|
||||||
|
|
|
@ -3,7 +3,7 @@ import argparse
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_meta_data
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -23,7 +23,7 @@ def main():
|
||||||
c = load_config(args.config_path)
|
c = load_config(args.config_path)
|
||||||
|
|
||||||
# load all datasets
|
# load all datasets
|
||||||
train_items, eval_items = load_meta_data(c.datasets, eval_split=True)
|
train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
|
||||||
items = train_items + eval_items
|
items = train_items + eval_items
|
||||||
|
|
||||||
texts = "".join(item[0] for item in items)
|
texts = "".join(item[0] for item in items)
|
||||||
|
|
|
@ -12,9 +12,9 @@ from torch.utils.data import DataLoader
|
||||||
from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
|
from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
|
||||||
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
||||||
from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
|
from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
|
||||||
|
from TTS.speaker_encoder.utils.training import init_training
|
||||||
from TTS.speaker_encoder.utils.visual import plot_embeddings
|
from TTS.speaker_encoder.utils.visual import plot_embeddings
|
||||||
from TTS.trainer import init_training
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.datasets import load_meta_data
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict
|
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
|
@ -156,7 +156,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
optimizer = RAdam(model.parameters(), lr=c.lr)
|
optimizer = RAdam(model.parameters(), lr=c.lr)
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name
|
# pylint: disable=redefined-outer-name
|
||||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False)
|
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False)
|
||||||
|
|
||||||
data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)
|
data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)
|
||||||
|
|
||||||
|
@ -208,7 +208,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training(sys.argv)
|
args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
main(args)
|
main(args)
|
||||||
|
|
|
@ -1,12 +1,71 @@
|
||||||
import sys
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, init_training
|
from TTS.config import load_config, register_config
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models import setup_model
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Run 🐸TTS trainer from terminal. This is also necessary to run DDP training by ```distribute.py```"""
|
"""Run `tts` model training directly by a `config.json` file."""
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(sys.argv)
|
# init trainer args
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger, cudnn_benchmark=False)
|
train_args = TrainingArgs()
|
||||||
|
parser = train_args.init_argparse(arg_prefix="")
|
||||||
|
|
||||||
|
# override trainer args from command-line args
|
||||||
|
args, config_overrides = parser.parse_known_args()
|
||||||
|
train_args.parse_args(args)
|
||||||
|
|
||||||
|
# load config.json and register
|
||||||
|
if args.config_path or args.continue_path:
|
||||||
|
if args.config_path:
|
||||||
|
# init from a file
|
||||||
|
config = load_config(args.config_path)
|
||||||
|
if len(config_overrides) > 0:
|
||||||
|
config.parse_known_args(config_overrides, relaxed_parser=True)
|
||||||
|
elif args.continue_path:
|
||||||
|
# continue from a prev experiment
|
||||||
|
config = load_config(os.path.join(args.continue_path, "config.json"))
|
||||||
|
if len(config_overrides) > 0:
|
||||||
|
config.parse_known_args(config_overrides, relaxed_parser=True)
|
||||||
|
else:
|
||||||
|
# init from console args
|
||||||
|
from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
|
||||||
|
|
||||||
|
config_base = BaseTrainingConfig()
|
||||||
|
config_base.parse_known_args(config_overrides)
|
||||||
|
config = register_config(config_base.model)()
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)
|
||||||
|
|
||||||
|
# setup audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# init speaker manager
|
||||||
|
if config.use_speaker_embedding:
|
||||||
|
speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
|
||||||
|
elif config.use_d_vector_file:
|
||||||
|
speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
|
||||||
|
else:
|
||||||
|
speaker_manager = None
|
||||||
|
|
||||||
|
# init the model from config
|
||||||
|
model = setup_model(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
train_args,
|
||||||
|
config,
|
||||||
|
config.output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
parse_command_line_args=False,
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,26 +1,69 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
from TTS.trainer import Trainer, init_training
|
from TTS.config import load_config, register_config
|
||||||
from TTS.utils.generic_utils import remove_experiment_folder
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||||
|
from TTS.vocoder.models import setup_model
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
try:
|
"""Run `vocoder` model training directly by a `config.json` file."""
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(sys.argv)
|
# init trainer args
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
train_args = TrainingArgs()
|
||||||
trainer.fit()
|
parser = train_args.init_argparse(arg_prefix="")
|
||||||
except KeyboardInterrupt:
|
|
||||||
remove_experiment_folder(output_path)
|
# override trainer args from command-line args
|
||||||
try:
|
args, config_overrides = parser.parse_known_args()
|
||||||
sys.exit(0)
|
train_args.parse_args(args)
|
||||||
except SystemExit:
|
|
||||||
os._exit(0) # pylint: disable=protected-access
|
# load config.json and register
|
||||||
except Exception: # pylint: disable=broad-except
|
if args.config_path or args.continue_path:
|
||||||
remove_experiment_folder(output_path)
|
if args.config_path:
|
||||||
traceback.print_exc()
|
# init from a file
|
||||||
sys.exit(1)
|
config = load_config(args.config_path)
|
||||||
|
if len(config_overrides) > 0:
|
||||||
|
config.parse_known_args(config_overrides, relaxed_parser=True)
|
||||||
|
elif args.continue_path:
|
||||||
|
# continue from a prev experiment
|
||||||
|
config = load_config(os.path.join(args.continue_path, "config.json"))
|
||||||
|
if len(config_overrides) > 0:
|
||||||
|
config.parse_known_args(config_overrides, relaxed_parser=True)
|
||||||
|
else:
|
||||||
|
# init from console args
|
||||||
|
from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
|
||||||
|
|
||||||
|
config_base = BaseTrainingConfig()
|
||||||
|
config_base.parse_known_args(config_overrides)
|
||||||
|
config = register_config(config_base.model)()
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
if "feature_path" in config and config.feature_path:
|
||||||
|
# load pre-computed features
|
||||||
|
print(f" > Loading features from: {config.feature_path}")
|
||||||
|
eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
|
||||||
|
else:
|
||||||
|
# load data raw wav files
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# setup audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# init the model from config
|
||||||
|
model = setup_model(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
train_args,
|
||||||
|
config,
|
||||||
|
config.output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
parse_command_line_args=False,
|
||||||
|
)
|
||||||
|
trainer.fit()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -36,10 +36,11 @@ def register_config(model_name: str) -> Coqpit:
|
||||||
Coqpit: config class.
|
Coqpit: config class.
|
||||||
"""
|
"""
|
||||||
config_class = None
|
config_class = None
|
||||||
|
config_name = model_name + "_config"
|
||||||
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"]
|
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"]
|
||||||
for path in paths:
|
for path in paths:
|
||||||
try:
|
try:
|
||||||
config_class = find_module(path, model_name + "_config")
|
config_class = find_module(path, config_name)
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
if config_class is None:
|
if config_class is None:
|
||||||
|
|
18
TTS/model.py
18
TTS/model.py
|
@ -6,8 +6,6 @@ import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
|
|
||||||
# pylint: skip-file
|
# pylint: skip-file
|
||||||
|
|
||||||
|
|
||||||
|
@ -22,6 +20,14 @@ class BaseModel(nn.Module, ABC):
|
||||||
- 1D tensors `batch x 1`
|
- 1D tensors `batch x 1`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Coqpit):
|
||||||
|
super().__init__()
|
||||||
|
self._set_model_args(config)
|
||||||
|
|
||||||
|
def _set_model_args(self, config: Coqpit):
|
||||||
|
"""Set model arguments from the config. Override this."""
|
||||||
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
|
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
|
||||||
"""Forward pass for the model mainly used in training.
|
"""Forward pass for the model mainly used in training.
|
||||||
|
@ -73,7 +79,7 @@ class BaseModel(nn.Module, ABC):
|
||||||
...
|
...
|
||||||
return outputs_dict, loss_dict
|
return outputs_dict, loss_dict
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
|
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||||
"""Create visualizations and waveform examples for training.
|
"""Create visualizations and waveform examples for training.
|
||||||
|
|
||||||
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
|
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
|
||||||
|
@ -87,7 +93,7 @@ class BaseModel(nn.Module, ABC):
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
||||||
"""
|
"""
|
||||||
return None, None
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||||
|
@ -106,9 +112,9 @@ class BaseModel(nn.Module, ABC):
|
||||||
...
|
...
|
||||||
return outputs_dict, loss_dict
|
return outputs_dict, loss_dict
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
|
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||||
"""The same as `train_log()`"""
|
"""The same as `train_log()`"""
|
||||||
return None, None
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None:
|
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None:
|
||||||
|
|
|
@ -0,0 +1,94 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from coqpit import Coqpit
|
||||||
|
|
||||||
|
from TTS.config import load_config, register_config
|
||||||
|
from TTS.trainer import TrainingArgs
|
||||||
|
from TTS.tts.utils.text.symbols import parse_symbols
|
||||||
|
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
|
||||||
|
from TTS.utils.io import copy_model_files
|
||||||
|
from TTS.utils.logging import init_dashboard_logger
|
||||||
|
from TTS.utils.logging.console_logger import ConsoleLogger
|
||||||
|
from TTS.utils.trainer_utils import get_last_checkpoint
|
||||||
|
|
||||||
|
|
||||||
|
def getarguments():
|
||||||
|
train_config = TrainingArgs()
|
||||||
|
parser = train_config.init_argparse(arg_prefix="")
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def process_args(args, config=None):
|
||||||
|
"""Process parsed command line arguments and initialize the config if not provided.
|
||||||
|
Args:
|
||||||
|
args (argparse.Namespace or dict like): Parsed input arguments.
|
||||||
|
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
|
||||||
|
Returns:
|
||||||
|
c (TTS.utils.io.AttrDict): Config parameters.
|
||||||
|
out_path (str): Path to save models and logging.
|
||||||
|
audio_path (str): Path to save generated test audios.
|
||||||
|
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
|
||||||
|
logging to the console.
|
||||||
|
dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
|
||||||
|
TODO:
|
||||||
|
- Interactive config definition.
|
||||||
|
"""
|
||||||
|
if isinstance(args, tuple):
|
||||||
|
args, coqpit_overrides = args
|
||||||
|
if args.continue_path:
|
||||||
|
# continue a previous training from its output folder
|
||||||
|
experiment_path = args.continue_path
|
||||||
|
args.config_path = os.path.join(args.continue_path, "config.json")
|
||||||
|
args.restore_path, best_model = get_last_checkpoint(args.continue_path)
|
||||||
|
if not args.best_path:
|
||||||
|
args.best_path = best_model
|
||||||
|
# init config if not already defined
|
||||||
|
if config is None:
|
||||||
|
if args.config_path:
|
||||||
|
# init from a file
|
||||||
|
config = load_config(args.config_path)
|
||||||
|
else:
|
||||||
|
# init from console args
|
||||||
|
from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
|
||||||
|
|
||||||
|
config_base = BaseTrainingConfig()
|
||||||
|
config_base.parse_known_args(coqpit_overrides)
|
||||||
|
config = register_config(config_base.model)()
|
||||||
|
# override values from command-line args
|
||||||
|
config.parse_known_args(coqpit_overrides, relaxed_parser=True)
|
||||||
|
experiment_path = args.continue_path
|
||||||
|
if not experiment_path:
|
||||||
|
experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
|
||||||
|
audio_path = os.path.join(experiment_path, "test_audios")
|
||||||
|
config.output_log_path = experiment_path
|
||||||
|
# setup rank 0 process in distributed training
|
||||||
|
dashboard_logger = None
|
||||||
|
if args.rank == 0:
|
||||||
|
new_fields = {}
|
||||||
|
if args.restore_path:
|
||||||
|
new_fields["restore_path"] = args.restore_path
|
||||||
|
new_fields["github_branch"] = get_git_branch()
|
||||||
|
# if model characters are not set in the config file
|
||||||
|
# save the default set to the config file for future
|
||||||
|
# compatibility.
|
||||||
|
if config.has("characters") and config.characters is None:
|
||||||
|
used_characters = parse_symbols()
|
||||||
|
new_fields["characters"] = used_characters
|
||||||
|
copy_model_files(config, experiment_path, new_fields)
|
||||||
|
dashboard_logger = init_dashboard_logger(config)
|
||||||
|
c_logger = ConsoleLogger()
|
||||||
|
return config, experiment_path, audio_path, c_logger, dashboard_logger
|
||||||
|
|
||||||
|
|
||||||
|
def init_arguments():
|
||||||
|
train_config = TrainingArgs()
|
||||||
|
parser = train_config.init_argparse(arg_prefix="")
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def init_training(config: Coqpit = None):
|
||||||
|
"""Initialization of a training run."""
|
||||||
|
parser = init_arguments()
|
||||||
|
args = parser.parse_known_args()
|
||||||
|
config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
|
||||||
|
return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
|
570
TTS/trainer.py
570
TTS/trainer.py
|
@ -4,16 +4,14 @@ import importlib
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from argparse import Namespace
|
from argparse import Namespace
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, List, Tuple, Union
|
from inspect import signature
|
||||||
from urllib.parse import urlparse
|
from typing import Callable, Dict, List, Tuple, Union
|
||||||
|
|
||||||
import fsspec
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
@ -21,11 +19,6 @@ from torch import nn
|
||||||
from torch.nn.parallel import DistributedDataParallel as DDP_th
|
from torch.nn.parallel import DistributedDataParallel as DDP_th
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
from TTS.config import load_config, register_config
|
|
||||||
from TTS.tts.datasets import load_meta_data
|
|
||||||
from TTS.tts.models import setup_model as setup_tts_model
|
|
||||||
from TTS.tts.utils.text.symbols import parse_symbols
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.callbacks import TrainerCallback
|
from TTS.utils.callbacks import TrainerCallback
|
||||||
from TTS.utils.distribute import init_distributed
|
from TTS.utils.distribute import init_distributed
|
||||||
from TTS.utils.generic_utils import (
|
from TTS.utils.generic_utils import (
|
||||||
|
@ -39,9 +32,13 @@ from TTS.utils.generic_utils import (
|
||||||
)
|
)
|
||||||
from TTS.utils.io import copy_model_files, load_fsspec, save_best_model, save_checkpoint
|
from TTS.utils.io import copy_model_files, load_fsspec, save_best_model, save_checkpoint
|
||||||
from TTS.utils.logging import ConsoleLogger, TensorboardLogger, WandbLogger, init_dashboard_logger
|
from TTS.utils.logging import ConsoleLogger, TensorboardLogger, WandbLogger, init_dashboard_logger
|
||||||
from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env
|
from TTS.utils.trainer_utils import (
|
||||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
get_last_checkpoint,
|
||||||
from TTS.vocoder.models import setup_model as setup_vocoder_model
|
get_optimizer,
|
||||||
|
get_scheduler,
|
||||||
|
is_apex_available,
|
||||||
|
setup_torch_training_env,
|
||||||
|
)
|
||||||
|
|
||||||
multiprocessing.set_start_method("fork")
|
multiprocessing.set_start_method("fork")
|
||||||
|
|
||||||
|
@ -80,6 +77,9 @@ class TrainingArgs(Coqpit):
|
||||||
"help": "Best model file to be used for extracting the best loss. If not specified, the latest best model in continue path is used"
|
"help": "Best model file to be used for extracting the best loss. If not specified, the latest best model in continue path is used"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
skip_train_epoch: bool = field(
|
||||||
|
default=False, metadata={"help": "Run only evaluation iteration. Useful for debugging."}
|
||||||
|
)
|
||||||
config_path: str = field(default="", metadata={"help": "Path to the configuration file."})
|
config_path: str = field(default="", metadata={"help": "Path to the configuration file."})
|
||||||
rank: int = field(default=0, metadata={"help": "Process rank in distributed training."})
|
rank: int = field(default=0, metadata={"help": "Process rank in distributed training."})
|
||||||
group_id: str = field(default="", metadata={"help": "Process group id in distributed training."})
|
group_id: str = field(default="", metadata={"help": "Process group id in distributed training."})
|
||||||
|
@ -90,7 +90,7 @@ class TrainingArgs(Coqpit):
|
||||||
|
|
||||||
|
|
||||||
class Trainer:
|
class Trainer:
|
||||||
def __init__(
|
def __init__( # pylint: disable=dangerous-default-value
|
||||||
self,
|
self,
|
||||||
args: Union[Coqpit, Namespace],
|
args: Union[Coqpit, Namespace],
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
|
@ -98,7 +98,13 @@ class Trainer:
|
||||||
c_logger: ConsoleLogger = None,
|
c_logger: ConsoleLogger = None,
|
||||||
dashboard_logger: Union[TensorboardLogger, WandbLogger] = None,
|
dashboard_logger: Union[TensorboardLogger, WandbLogger] = None,
|
||||||
model: nn.Module = None,
|
model: nn.Module = None,
|
||||||
|
get_model: Callable = None,
|
||||||
|
get_data_samples: Callable = None,
|
||||||
|
train_samples: List = None,
|
||||||
|
eval_samples: List = None,
|
||||||
cudnn_benchmark: bool = False,
|
cudnn_benchmark: bool = False,
|
||||||
|
training_assets: Dict = {},
|
||||||
|
parse_command_line_args: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models
|
"""Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models
|
||||||
or easily be customized.
|
or easily be customized.
|
||||||
|
@ -127,24 +133,44 @@ class Trainer:
|
||||||
model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer`
|
model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer`
|
||||||
initializes a model from the provided config. Defaults to None.
|
initializes a model from the provided config. Defaults to None.
|
||||||
|
|
||||||
|
get_model (Callable):
|
||||||
|
A function that returns a model. It is used to initialize the model when `model` is not provided.
|
||||||
|
It either takes the config as the only argument or does not take any argument.
|
||||||
|
Defaults to None
|
||||||
|
|
||||||
|
get_data_samples (Callable):
|
||||||
|
A function that returns a list of training and evaluation samples. Used if `train_samples` and
|
||||||
|
`eval_samples` are None. Defaults to None.
|
||||||
|
|
||||||
|
train_samples (List):
|
||||||
|
A list of training samples used by the model's `get_data_loader` to init the `dataset` and the
|
||||||
|
`data_loader`. Defaults to None.
|
||||||
|
|
||||||
|
eval_samples (List):
|
||||||
|
A list of evaluation samples used by the model's `get_data_loader` to init the `dataset` and the
|
||||||
|
`data_loader`. Defaults to None.
|
||||||
|
|
||||||
cudnn_benchmark (bool): enable/disable PyTorch cudnn benchmarking. It is better to disable if the model input
|
cudnn_benchmark (bool): enable/disable PyTorch cudnn benchmarking. It is better to disable if the model input
|
||||||
length is changing batch to batch along the training.
|
length is changing batch to batch along the training.
|
||||||
|
|
||||||
|
training_assets (Dict):
|
||||||
|
A dictionary of assets to be used at training and passed to the model's ```train_log(), eval_log(), get_data_loader()```
|
||||||
|
during training. It can include `AudioProcessor` and/or `Tokenizer`. Defaults to {}.
|
||||||
|
|
||||||
|
parse_command_line_args (bool):
|
||||||
|
If true, parse command-line arguments and update `TrainingArgs` and model `config` values. Set it
|
||||||
|
to false if you parse the arguments yourself. Defaults to True.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
Running trainer on a model.
|
Running trainer with HifiGAN model.
|
||||||
|
|
||||||
>>> args = TrainingArgs(...)
|
>>> args = TrainingArgs(...)
|
||||||
>>> config = HifiganConfig(...)
|
>>> config = HifiganConfig(...)
|
||||||
>>> model = GANModel(config)
|
>>> model = GANModel(config)
|
||||||
>>> trainer = Trainer(args, config, output_path, model=model)
|
>>> ap = AudioProcessor(**config.audio)
|
||||||
>>> trainer.fit()
|
>>> assets = {"audio_processor": ap}
|
||||||
|
>>> trainer = Trainer(args, config, output_path, model=model, training_assets=assets)
|
||||||
Running trainer on a config.
|
|
||||||
|
|
||||||
>>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,)
|
|
||||||
>>> args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
>>> trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
|
||||||
>>> trainer.fit()
|
>>> trainer.fit()
|
||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
|
@ -154,20 +180,41 @@ class Trainer:
|
||||||
- Profiler integration.
|
- Profiler integration.
|
||||||
- Overfitting to a batch.
|
- Overfitting to a batch.
|
||||||
- TPU training
|
- TPU training
|
||||||
|
- NOTE: Consider moving `training_assets` to the model implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if config is None:
|
if parse_command_line_args:
|
||||||
# parse config from console arguments
|
# parse command-line arguments for TrainingArgs()
|
||||||
config, output_path, _, c_logger, dashboard_logger = process_args(args)
|
args, coqpit_overrides = self.parse_argv(args)
|
||||||
|
|
||||||
|
# get ready for training and parse command-line arguments for the model config
|
||||||
|
config = self.init_training(args, coqpit_overrides, config)
|
||||||
|
|
||||||
|
# set the output path
|
||||||
|
if args.continue_path:
|
||||||
|
# use the same path as the continuing run
|
||||||
|
output_path = args.continue_path
|
||||||
|
else:
|
||||||
|
# override the output path if it is provided
|
||||||
|
output_path = config.output_path if output_path is None else output_path
|
||||||
|
# create a new output folder name
|
||||||
|
output_path = get_experiment_folder_path(config.output_path, config.run_name)
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
|
||||||
|
# copy training assets to the output folder
|
||||||
|
copy_model_files(config, output_path, new_fields=None)
|
||||||
|
|
||||||
|
# init class members
|
||||||
self.args = args
|
self.args = args
|
||||||
self.config = config
|
self.config = config
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
self.config.output_log_path = output_path
|
self.config.output_log_path = output_path
|
||||||
|
self.training_assets = training_assets
|
||||||
|
|
||||||
# setup logging
|
# setup logging
|
||||||
log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt")
|
log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt")
|
||||||
self._setup_logger_config(log_file)
|
self._setup_logger_config(log_file)
|
||||||
|
time.sleep(1.0) # wait for the logger to be ready
|
||||||
|
|
||||||
# set and initialize Pytorch runtime
|
# set and initialize Pytorch runtime
|
||||||
self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark, args.use_ddp)
|
self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark, args.use_ddp)
|
||||||
|
@ -196,33 +243,22 @@ class Trainer:
|
||||||
self.use_apex = self._is_apex_available()
|
self.use_apex = self._is_apex_available()
|
||||||
self.use_amp_scaler = self.config.mixed_precision and self.use_cuda
|
self.use_amp_scaler = self.config.mixed_precision and self.use_cuda
|
||||||
|
|
||||||
# init audio processor
|
|
||||||
self.ap = AudioProcessor(**self.config.audio.to_dict())
|
|
||||||
|
|
||||||
# load data samples
|
# load data samples
|
||||||
# TODO: refactor this
|
if train_samples is None and get_data_samples is None:
|
||||||
if "datasets" in self.config:
|
raise ValueError("[!] `train_samples` and `get_data_samples` cannot both be None.")
|
||||||
# load data for `tts` models
|
if train_samples is not None:
|
||||||
self.data_train, self.data_eval = load_meta_data(self.config.datasets)
|
self.train_samples = train_samples
|
||||||
elif self.config.feature_path is not None:
|
self.eval_samples = eval_samples
|
||||||
# load pre-comnputed features for `vocoder`models
|
|
||||||
print(f" > Loading features from: {self.config.feature_path}")
|
|
||||||
self.data_eval, self.data_train = load_wav_feat_data(
|
|
||||||
self.config.data_path, self.config.feature_path, self.config.eval_split_size
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# load data for `vocoder`models
|
self.train_samples, self.eval_samples = self.run_get_data_samples(config, get_data_samples)
|
||||||
self.data_eval, self.data_train = load_wav_data(self.config.data_path, self.config.eval_split_size)
|
|
||||||
|
|
||||||
# init TTS model
|
# init TTS model
|
||||||
|
if model is None and get_model is None:
|
||||||
|
raise ValueError("[!] `model` and `get_model` cannot both be None.")
|
||||||
if model is not None:
|
if model is not None:
|
||||||
self.model = model
|
self.model = model
|
||||||
else:
|
else:
|
||||||
self.model = self.get_model(self.config)
|
self.run_get_model(self.config, get_model)
|
||||||
|
|
||||||
# init multispeaker settings of the model
|
|
||||||
if hasattr(self.model, "init_multispeaker"):
|
|
||||||
self.model.init_multispeaker(self.config, self.data_train + self.data_eval)
|
|
||||||
|
|
||||||
# setup criterion
|
# setup criterion
|
||||||
self.criterion = self.get_criterion(self.model)
|
self.criterion = self.get_criterion(self.model)
|
||||||
|
@ -247,7 +283,7 @@ class Trainer:
|
||||||
# setup optimizer
|
# setup optimizer
|
||||||
self.optimizer = self.get_optimizer(self.model, self.config)
|
self.optimizer = self.get_optimizer(self.model, self.config)
|
||||||
|
|
||||||
# callback
|
# CALLBACK
|
||||||
self.callbacks = TrainerCallback(self)
|
self.callbacks = TrainerCallback(self)
|
||||||
self.callbacks.on_init_start()
|
self.callbacks.on_init_start()
|
||||||
|
|
||||||
|
@ -280,7 +316,7 @@ class Trainer:
|
||||||
else:
|
else:
|
||||||
self.scheduler.last_epoch = self.restore_step
|
self.scheduler.last_epoch = self.restore_step
|
||||||
|
|
||||||
# DISTRUBUTED
|
# DISTRIBUTED
|
||||||
if self.num_gpus > 1:
|
if self.num_gpus > 1:
|
||||||
self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank)
|
self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank)
|
||||||
|
|
||||||
|
@ -291,8 +327,56 @@ class Trainer:
|
||||||
self.callbacks.on_init_end()
|
self.callbacks.on_init_end()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_model(config: Coqpit) -> nn.Module:
|
def parse_argv(args: Union[Coqpit, List]):
|
||||||
"""Initialize model from config.
|
"""Parse command line arguments to init or override `TrainingArgs()`."""
|
||||||
|
if isinstance(args, Coqpit):
|
||||||
|
parser = args.init_argparse(arg_prefix="")
|
||||||
|
else:
|
||||||
|
train_config = TrainingArgs()
|
||||||
|
parser = train_config.init_argparse(arg_prefix="")
|
||||||
|
training_args, coqpit_overrides = parser.parse_known_args()
|
||||||
|
args.parse_args(training_args)
|
||||||
|
return args, coqpit_overrides
|
||||||
|
|
||||||
|
def init_training(
|
||||||
|
self, args: TrainingArgs, coqpit_overrides: Dict, config: Coqpit = None
|
||||||
|
): # pylint: disable=no-self-use
|
||||||
|
"""Initialize training and update model configs from command line arguments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
args (argparse.Namespace or dict like): Parsed input arguments.
|
||||||
|
coqpit_overrides (argparse.Namespace or dict like): Parsed config overriding arguments.
|
||||||
|
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
config (Coqpit): Config parameters.
|
||||||
|
"""
|
||||||
|
# set arguments for continuing training
|
||||||
|
if args.continue_path:
|
||||||
|
experiment_path = args.continue_path
|
||||||
|
args.config_path = os.path.join(args.continue_path, "config.json")
|
||||||
|
args.restore_path, best_model = get_last_checkpoint(args.continue_path)
|
||||||
|
if not args.best_path:
|
||||||
|
args.best_path = best_model
|
||||||
|
|
||||||
|
# override config values from command-line args
|
||||||
|
# TODO: Maybe it is better to do it outside
|
||||||
|
if len(coqpit_overrides) > 0:
|
||||||
|
config.parse_known_args(coqpit_overrides, arg_prefix="coqpit", relaxed_parser=True)
|
||||||
|
experiment_path = args.continue_path
|
||||||
|
|
||||||
|
# update the config.json fields and copy it to the output folder
|
||||||
|
if args.rank == 0:
|
||||||
|
new_fields = {}
|
||||||
|
if args.restore_path:
|
||||||
|
new_fields["restore_path"] = args.restore_path
|
||||||
|
new_fields["github_branch"] = get_git_branch()
|
||||||
|
copy_model_files(config, experiment_path, new_fields)
|
||||||
|
return config
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_get_model(config: Coqpit, get_model: Callable) -> nn.Module:
|
||||||
|
"""Run the `get_model` function and return the model.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): Model config.
|
config (Coqpit): Model config.
|
||||||
|
@ -300,12 +384,22 @@ class Trainer:
|
||||||
Returns:
|
Returns:
|
||||||
nn.Module: initialized model.
|
nn.Module: initialized model.
|
||||||
"""
|
"""
|
||||||
try:
|
if len(signature(get_model).sig.parameters) == 1:
|
||||||
model = setup_vocoder_model(config)
|
model = get_model(config)
|
||||||
except ModuleNotFoundError:
|
else:
|
||||||
model = setup_tts_model(config)
|
model = get_model()
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def run_get_data_samples(config: Coqpit, get_data_samples: Callable) -> nn.Module:
|
||||||
|
if callable(get_data_samples):
|
||||||
|
if len(signature(get_data_samples).sig.parameters) == 1:
|
||||||
|
train_samples, eval_samples = get_data_samples(config)
|
||||||
|
else:
|
||||||
|
train_samples, eval_samples = get_data_samples()
|
||||||
|
return train_samples, eval_samples
|
||||||
|
return None, None
|
||||||
|
|
||||||
def restore_model(
|
def restore_model(
|
||||||
self,
|
self,
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
|
@ -366,11 +460,15 @@ class Trainer:
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
return model, optimizer, scaler, restore_step
|
return model, optimizer, scaler, restore_step
|
||||||
|
|
||||||
|
#########################
|
||||||
|
# DATA LOADING FUNCTIONS
|
||||||
|
#########################
|
||||||
|
|
||||||
def _get_loader(
|
def _get_loader(
|
||||||
self,
|
self,
|
||||||
model: nn.Module,
|
model: nn.Module,
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
ap: AudioProcessor,
|
assets: Dict,
|
||||||
is_eval: bool,
|
is_eval: bool,
|
||||||
data_items: List,
|
data_items: List,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
@ -379,14 +477,14 @@ class Trainer:
|
||||||
if num_gpus > 1:
|
if num_gpus > 1:
|
||||||
if hasattr(model.module, "get_data_loader"):
|
if hasattr(model.module, "get_data_loader"):
|
||||||
loader = model.module.get_data_loader(
|
loader = model.module.get_data_loader(
|
||||||
config, ap, is_eval, data_items, verbose, num_gpus, self.args.rank
|
config, assets, is_eval, data_items, verbose, num_gpus, self.args.rank
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if hasattr(model, "get_data_loader"):
|
if hasattr(model, "get_data_loader"):
|
||||||
loader = model.get_data_loader(config, ap, is_eval, data_items, verbose, num_gpus)
|
loader = model.get_data_loader(config, assets, is_eval, data_items, verbose, num_gpus)
|
||||||
return loader
|
return loader
|
||||||
|
|
||||||
def get_train_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader:
|
def get_train_dataloader(self, training_assets: Dict, data_items: List, verbose: bool) -> DataLoader:
|
||||||
"""Initialize and return a training data loader.
|
"""Initialize and return a training data loader.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -397,10 +495,10 @@ class Trainer:
|
||||||
Returns:
|
Returns:
|
||||||
DataLoader: Initialized training data loader.
|
DataLoader: Initialized training data loader.
|
||||||
"""
|
"""
|
||||||
return self._get_loader(self.model, self.config, ap, False, data_items, verbose, self.num_gpus)
|
return self._get_loader(self.model, self.config, training_assets, False, data_items, verbose, self.num_gpus)
|
||||||
|
|
||||||
def get_eval_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader:
|
def get_eval_dataloader(self, training_assets: Dict, data_items: List, verbose: bool) -> DataLoader:
|
||||||
return self._get_loader(self.model, self.config, ap, True, data_items, verbose, self.num_gpus)
|
return self._get_loader(self.model, self.config, training_assets, True, data_items, verbose, self.num_gpus)
|
||||||
|
|
||||||
def format_batch(self, batch: List) -> Dict:
|
def format_batch(self, batch: List) -> Dict:
|
||||||
"""Format the dataloader output and return a batch.
|
"""Format the dataloader output and return a batch.
|
||||||
|
@ -420,6 +518,10 @@ class Trainer:
|
||||||
batch[k] = to_cuda(v)
|
batch[k] = to_cuda(v)
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
|
######################
|
||||||
|
# TRAIN FUNCTIONS
|
||||||
|
######################
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def master_params(optimizer: torch.optim.Optimizer):
|
def master_params(optimizer: torch.optim.Optimizer):
|
||||||
"""Generator over parameters owned by the optimizer.
|
"""Generator over parameters owned by the optimizer.
|
||||||
|
@ -516,10 +618,8 @@ class Trainer:
|
||||||
else:
|
else:
|
||||||
grad_clip = 0.0 # meaning no gradient clipping
|
grad_clip = 0.0 # meaning no gradient clipping
|
||||||
|
|
||||||
if grad_clip <= 0:
|
|
||||||
grad_norm = 0
|
|
||||||
|
|
||||||
# optimizer step
|
# optimizer step
|
||||||
|
grad_norm = 0
|
||||||
update_lr_scheduler = True
|
update_lr_scheduler = True
|
||||||
if self.use_amp_scaler:
|
if self.use_amp_scaler:
|
||||||
if self.use_apex:
|
if self.use_apex:
|
||||||
|
@ -527,31 +627,29 @@ class Trainer:
|
||||||
# https://nvidia.github.io/apex/advanced.html?highlight=accumulate#backward-passes-with-multiple-optimizers
|
# https://nvidia.github.io/apex/advanced.html?highlight=accumulate#backward-passes-with-multiple-optimizers
|
||||||
with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss:
|
with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss:
|
||||||
scaled_loss.backward()
|
scaled_loss.backward()
|
||||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip)
|
||||||
amp.master_params(optimizer), grad_clip, error_if_nonfinite=False
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# model optimizer step in mixed precision mode
|
# model optimizer step in mixed precision mode
|
||||||
scaler.scale(loss_dict["loss"]).backward()
|
scaler.scale(loss_dict["loss"]).backward()
|
||||||
if grad_clip > 0:
|
if grad_clip > 0:
|
||||||
scaler.unscale_(optimizer)
|
scaler.unscale_(optimizer)
|
||||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
grad_norm = torch.nn.utils.clip_grad_norm_(self.master_params(optimizer), grad_clip)
|
||||||
self.master_params(optimizer), grad_clip, error_if_nonfinite=False
|
|
||||||
)
|
|
||||||
# pytorch skips the step when the norm is 0. So ignore the norm value when it is NaN
|
|
||||||
if torch.isnan(grad_norm) or torch.isinf(grad_norm):
|
|
||||||
grad_norm = 0
|
|
||||||
scale_prev = scaler.get_scale()
|
scale_prev = scaler.get_scale()
|
||||||
scaler.step(optimizer)
|
scaler.step(optimizer)
|
||||||
scaler.update()
|
scaler.update()
|
||||||
update_lr_scheduler = scale_prev <= scaler.get_scale()
|
update_lr_scheduler = scale_prev <= scaler.get_scale()
|
||||||
|
loss_dict["amp_scaler"] = scaler.get_scale() # for logging
|
||||||
else:
|
else:
|
||||||
# main model optimizer step
|
# main model optimizer step
|
||||||
loss_dict["loss"].backward()
|
loss_dict["loss"].backward()
|
||||||
if grad_clip > 0:
|
if grad_clip > 0:
|
||||||
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip, error_if_nonfinite=False)
|
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
|
# pytorch skips the step when the norm is 0. So ignore the norm value when it is NaN
|
||||||
|
if isinstance(grad_norm, torch.Tensor) and (torch.isnan(grad_norm) or torch.isinf(grad_norm)):
|
||||||
|
grad_norm = 0
|
||||||
|
|
||||||
step_time = time.time() - step_start_time
|
step_time = time.time() - step_start_time
|
||||||
|
|
||||||
# setup lr
|
# setup lr
|
||||||
|
@ -567,24 +665,6 @@ class Trainer:
|
||||||
loss_dict["grad_norm"] = grad_norm
|
loss_dict["grad_norm"] = grad_norm
|
||||||
return outputs, loss_dict, step_time
|
return outputs, loss_dict, step_time
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _detach_loss_dict(loss_dict: Dict) -> Dict:
|
|
||||||
"""Detach loss values from autograp.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
loss_dict (Dict): losses.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict: losses detached from autograph.
|
|
||||||
"""
|
|
||||||
loss_dict_detached = {}
|
|
||||||
for key, value in loss_dict.items():
|
|
||||||
if isinstance(value, (int, float)):
|
|
||||||
loss_dict_detached[key] = value
|
|
||||||
else:
|
|
||||||
loss_dict_detached[key] = value.item()
|
|
||||||
return loss_dict_detached
|
|
||||||
|
|
||||||
def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]:
|
def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]:
|
||||||
"""Perform a training step on a batch of inputs and log the process.
|
"""Perform a training step on a batch of inputs and log the process.
|
||||||
|
|
||||||
|
@ -700,15 +780,14 @@ class Trainer:
|
||||||
self.dashboard_logger.log_artifact(self.output_path, "checkpoint", "model", aliases)
|
self.dashboard_logger.log_artifact(self.output_path, "checkpoint", "model", aliases)
|
||||||
|
|
||||||
# training visualizations
|
# training visualizations
|
||||||
figures, audios = None, None
|
|
||||||
if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"):
|
if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"):
|
||||||
figures, audios = self.model.module.train_log(self.ap, batch, outputs)
|
self.model.module.train_log(
|
||||||
|
batch, outputs, self.dashboard_logger, self.training_assets, self.total_steps_done
|
||||||
|
)
|
||||||
elif hasattr(self.model, "train_log"):
|
elif hasattr(self.model, "train_log"):
|
||||||
figures, audios = self.model.train_log(self.ap, batch, outputs)
|
self.model.train_log(
|
||||||
if figures is not None:
|
batch, outputs, self.dashboard_logger, self.training_assets, self.total_steps_done
|
||||||
self.dashboard_logger.train_figures(self.total_steps_done, figures)
|
)
|
||||||
if audios is not None:
|
|
||||||
self.dashboard_logger.train_audios(self.total_steps_done, audios, self.ap.sample_rate)
|
|
||||||
|
|
||||||
self.dashboard_logger.flush()
|
self.dashboard_logger.flush()
|
||||||
|
|
||||||
|
@ -718,11 +797,13 @@ class Trainer:
|
||||||
|
|
||||||
def train_epoch(self) -> None:
|
def train_epoch(self) -> None:
|
||||||
"""Main entry point for the training loop. Run training on the all training samples."""
|
"""Main entry point for the training loop. Run training on the all training samples."""
|
||||||
|
# initialize the data loader
|
||||||
self.train_loader = self.get_train_dataloader(
|
self.train_loader = self.get_train_dataloader(
|
||||||
self.ap,
|
self.training_assets,
|
||||||
self.data_train,
|
self.train_samples,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
)
|
)
|
||||||
|
# set model to training mode
|
||||||
if self.num_gpus > 1:
|
if self.num_gpus > 1:
|
||||||
self.model.module.train()
|
self.model.module.train()
|
||||||
else:
|
else:
|
||||||
|
@ -734,11 +815,12 @@ class Trainer:
|
||||||
batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size)
|
batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size)
|
||||||
self.c_logger.print_train_start()
|
self.c_logger.print_train_start()
|
||||||
loader_start_time = time.time()
|
loader_start_time = time.time()
|
||||||
|
# iterate over the training samples
|
||||||
for cur_step, batch in enumerate(self.train_loader):
|
for cur_step, batch in enumerate(self.train_loader):
|
||||||
_, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
|
_, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
|
||||||
loader_start_time = time.time()
|
loader_start_time = time.time()
|
||||||
epoch_time = time.time() - epoch_start_time
|
epoch_time = time.time() - epoch_start_time
|
||||||
# Plot self.epochs_done Stats
|
# plot self.epochs_done Stats
|
||||||
if self.args.rank == 0:
|
if self.args.rank == 0:
|
||||||
epoch_stats = {"epoch_time": epoch_time}
|
epoch_stats = {"epoch_time": epoch_time}
|
||||||
epoch_stats.update(self.keep_avg_train.avg_values)
|
epoch_stats.update(self.keep_avg_train.avg_values)
|
||||||
|
@ -754,6 +836,10 @@ class Trainer:
|
||||||
else:
|
else:
|
||||||
self.scheduler.step()
|
self.scheduler.step()
|
||||||
|
|
||||||
|
#######################
|
||||||
|
# EVAL FUNCTIONS
|
||||||
|
#######################
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _model_eval_step(
|
def _model_eval_step(
|
||||||
batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None
|
batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None
|
||||||
|
@ -803,7 +889,7 @@ class Trainer:
|
||||||
loss_dict_new[f"loss_{idx}"] = loss_dict_new.pop("loss")
|
loss_dict_new[f"loss_{idx}"] = loss_dict_new.pop("loss")
|
||||||
loss_dict.update(loss_dict_new)
|
loss_dict.update(loss_dict_new)
|
||||||
|
|
||||||
loss_dict = self._detach_loss_dict(loss_dict)
|
loss_dict = self._detach_loss_dict(loss_dict)
|
||||||
|
|
||||||
# update avg stats
|
# update avg stats
|
||||||
update_eval_values = {}
|
update_eval_values = {}
|
||||||
|
@ -819,8 +905,8 @@ class Trainer:
|
||||||
"""Main entry point for the evaluation loop. Run evaluation on the all validation samples."""
|
"""Main entry point for the evaluation loop. Run evaluation on the all validation samples."""
|
||||||
self.eval_loader = (
|
self.eval_loader = (
|
||||||
self.get_eval_dataloader(
|
self.get_eval_dataloader(
|
||||||
self.ap,
|
self.training_assets,
|
||||||
self.data_eval,
|
self.eval_samples,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
)
|
)
|
||||||
if self.config.run_eval
|
if self.config.run_eval
|
||||||
|
@ -840,15 +926,12 @@ class Trainer:
|
||||||
loader_start_time = time.time()
|
loader_start_time = time.time()
|
||||||
# plot epoch stats, artifacts and figures
|
# plot epoch stats, artifacts and figures
|
||||||
if self.args.rank == 0:
|
if self.args.rank == 0:
|
||||||
figures, audios = None, None
|
|
||||||
if hasattr(self.model, "module") and hasattr(self.model.module, "eval_log"):
|
if hasattr(self.model, "module") and hasattr(self.model.module, "eval_log"):
|
||||||
figures, audios = self.model.module.eval_log(self.ap, batch, outputs)
|
self.model.module.eval_log(
|
||||||
|
batch, outputs, self.dashboard_logger, self.training_assets, self.total_steps_done
|
||||||
|
)
|
||||||
elif hasattr(self.model, "eval_log"):
|
elif hasattr(self.model, "eval_log"):
|
||||||
figures, audios = self.model.eval_log(self.ap, batch, outputs)
|
self.model.eval_log(batch, outputs, self.dashboard_logger, self.training_assets, self.total_steps_done)
|
||||||
if figures is not None:
|
|
||||||
self.dashboard_logger.eval_figures(self.total_steps_done, figures)
|
|
||||||
if audios is not None:
|
|
||||||
self.dashboard_logger.eval_audios(self.total_steps_done, audios, self.ap.sample_rate)
|
|
||||||
self.dashboard_logger.eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values)
|
self.dashboard_logger.eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values)
|
||||||
|
|
||||||
def test_run(self) -> None:
|
def test_run(self) -> None:
|
||||||
|
@ -857,22 +940,22 @@ class Trainer:
|
||||||
if hasattr(self.model, "test_run") or (self.num_gpus > 1 and hasattr(self.model.module, "test_run")):
|
if hasattr(self.model, "test_run") or (self.num_gpus > 1 and hasattr(self.model.module, "test_run")):
|
||||||
if self.eval_loader is None:
|
if self.eval_loader is None:
|
||||||
self.eval_loader = self.get_eval_dataloader(
|
self.eval_loader = self.get_eval_dataloader(
|
||||||
self.ap,
|
self.training_assets,
|
||||||
self.data_eval,
|
self.eval_samples,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hasattr(self.eval_loader.dataset, "load_test_samples"):
|
if hasattr(self.eval_loader.dataset, "load_test_samples"):
|
||||||
samples = self.eval_loader.dataset.load_test_samples(1)
|
samples = self.eval_loader.dataset.load_test_samples(1)
|
||||||
if self.num_gpus > 1:
|
if self.num_gpus > 1:
|
||||||
figures, audios = self.model.module.test_run(self.ap, samples, None)
|
figures, audios = self.model.module.test_run(self.training_assets, samples, None)
|
||||||
else:
|
else:
|
||||||
figures, audios = self.model.test_run(self.ap, samples, None)
|
figures, audios = self.model.test_run(self.training_assets, samples, None)
|
||||||
else:
|
else:
|
||||||
if self.num_gpus > 1:
|
if self.num_gpus > 1:
|
||||||
figures, audios = self.model.module.test_run(self.ap)
|
figures, audios = self.model.module.test_run(self.training_assets)
|
||||||
else:
|
else:
|
||||||
figures, audios = self.model.test_run(self.ap)
|
figures, audios = self.model.test_run(self.training_assets)
|
||||||
self.dashboard_logger.test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
|
self.dashboard_logger.test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
|
||||||
self.dashboard_logger.test_figures(self.total_steps_done, figures)
|
self.dashboard_logger.test_figures(self.total_steps_done, figures)
|
||||||
|
|
||||||
|
@ -886,6 +969,10 @@ class Trainer:
|
||||||
self.best_loss = ch["model_loss"]
|
self.best_loss = ch["model_loss"]
|
||||||
print(f" > Starting with loaded last best loss {self.best_loss}.")
|
print(f" > Starting with loaded last best loss {self.best_loss}.")
|
||||||
|
|
||||||
|
###################################
|
||||||
|
# FIT FUNCTIONS
|
||||||
|
###################################
|
||||||
|
|
||||||
def _fit(self) -> None:
|
def _fit(self) -> None:
|
||||||
"""🏃 train -> evaluate -> test for the number of epochs."""
|
"""🏃 train -> evaluate -> test for the number of epochs."""
|
||||||
self._restore_best_loss()
|
self._restore_best_loss()
|
||||||
|
@ -901,7 +988,8 @@ class Trainer:
|
||||||
self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
|
self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
|
||||||
self.epochs_done = epoch
|
self.epochs_done = epoch
|
||||||
self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path)
|
self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path)
|
||||||
self.train_epoch()
|
if not self.args.skip_train_epoch:
|
||||||
|
self.train_epoch()
|
||||||
if self.config.run_eval:
|
if self.config.run_eval:
|
||||||
self.eval_epoch()
|
self.eval_epoch()
|
||||||
if epoch >= self.config.test_delay_epochs and self.args.rank <= 0:
|
if epoch >= self.config.test_delay_epochs and self.args.rank <= 0:
|
||||||
|
@ -939,24 +1027,6 @@ class Trainer:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
def _pick_target_avg_loss(self, keep_avg_target: KeepAverage) -> Dict:
|
|
||||||
"""Pick the target loss to compare models"""
|
|
||||||
target_avg_loss = None
|
|
||||||
|
|
||||||
# return if target loss defined in the model config
|
|
||||||
if "target_loss" in self.config and self.config.target_loss:
|
|
||||||
return keep_avg_target[f"avg_{self.config.target_loss}"]
|
|
||||||
|
|
||||||
# take the average of loss_{optimizer_idx} as the target loss when there are multiple optimizers
|
|
||||||
if isinstance(self.optimizer, list):
|
|
||||||
target_avg_loss = 0
|
|
||||||
for idx in range(len(self.optimizer)):
|
|
||||||
target_avg_loss += keep_avg_target[f"avg_loss_{idx}"]
|
|
||||||
target_avg_loss /= len(self.optimizer)
|
|
||||||
else:
|
|
||||||
target_avg_loss = keep_avg_target["avg_loss"]
|
|
||||||
return target_avg_loss
|
|
||||||
|
|
||||||
def save_best_model(self) -> None:
|
def save_best_model(self) -> None:
|
||||||
"""Save the best model. It only saves if the current target loss is smaller then the previous."""
|
"""Save the best model. It only saves if the current target loss is smaller then the previous."""
|
||||||
|
|
||||||
|
@ -978,35 +1048,9 @@ class Trainer:
|
||||||
keep_after=self.config.keep_after,
|
keep_after=self.config.keep_after,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _setup_logger_config(self, log_file: str) -> None:
|
#####################
|
||||||
"""Write log strings to a file and print logs to the terminal.
|
# GET FUNCTIONS
|
||||||
TODO: Causes formatting issues in pdb debugging."""
|
#####################
|
||||||
|
|
||||||
class Logger(object):
|
|
||||||
def __init__(self, print_to_terminal=True):
|
|
||||||
self.print_to_terminal = print_to_terminal
|
|
||||||
self.terminal = sys.stdout
|
|
||||||
self.log_file = log_file
|
|
||||||
|
|
||||||
def write(self, message):
|
|
||||||
if self.print_to_terminal:
|
|
||||||
self.terminal.write(message)
|
|
||||||
with open(self.log_file, "a", encoding="utf-8") as f:
|
|
||||||
f.write(message)
|
|
||||||
|
|
||||||
def flush(self):
|
|
||||||
# this flush method is needed for python 3 compatibility.
|
|
||||||
# this handles the flush command by doing nothing.
|
|
||||||
# you might want to specify some extra behavior here.
|
|
||||||
pass
|
|
||||||
|
|
||||||
# don't let processes rank > 0 write to the terminal
|
|
||||||
sys.stdout = Logger(self.args.rank == 0)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_apex_available() -> bool:
|
|
||||||
"""Check if Nvidia's APEX is available."""
|
|
||||||
return importlib.util.find_spec("apex") is not None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]:
|
def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]:
|
||||||
|
@ -1084,154 +1128,72 @@ class Trainer:
|
||||||
criterion = model.get_criterion()
|
criterion = model.get_criterion()
|
||||||
return criterion
|
return criterion
|
||||||
|
|
||||||
|
####################
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
####################
|
||||||
|
|
||||||
def getarguments():
|
@staticmethod
|
||||||
train_config = TrainingArgs()
|
def _detach_loss_dict(loss_dict: Dict) -> Dict:
|
||||||
parser = train_config.init_argparse(arg_prefix="")
|
"""Detach loss values from autograp.
|
||||||
return parser
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
loss_dict (Dict): losses.
|
||||||
|
|
||||||
def get_last_checkpoint(path: str) -> Tuple[str, str]:
|
Returns:
|
||||||
"""Get latest checkpoint or/and best model in path.
|
Dict: losses detached from autograph.
|
||||||
|
"""
|
||||||
|
loss_dict_detached = {}
|
||||||
|
for key, value in loss_dict.items():
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
loss_dict_detached[key] = value
|
||||||
|
else:
|
||||||
|
loss_dict_detached[key] = value.detach().item()
|
||||||
|
return loss_dict_detached
|
||||||
|
|
||||||
It is based on globbing for `*.pth.tar` and the RegEx
|
def _pick_target_avg_loss(self, keep_avg_target: KeepAverage) -> Dict:
|
||||||
`(checkpoint|best_model)_([0-9]+)`.
|
"""Pick the target loss to compare models"""
|
||||||
|
target_avg_loss = None
|
||||||
|
|
||||||
Args:
|
# return if target loss defined in the model config
|
||||||
path: Path to files to be compared.
|
if "target_loss" in self.config and self.config.target_loss:
|
||||||
|
return keep_avg_target[f"avg_{self.config.target_loss}"]
|
||||||
|
|
||||||
Raises:
|
# take the average of loss_{optimizer_idx} as the target loss when there are multiple optimizers
|
||||||
ValueError: If no checkpoint or best_model files are found.
|
if isinstance(self.optimizer, list):
|
||||||
|
target_avg_loss = 0
|
||||||
Returns:
|
for idx in range(len(self.optimizer)):
|
||||||
Path to the last checkpoint
|
target_avg_loss += keep_avg_target[f"avg_loss_{idx}"]
|
||||||
Path to best checkpoint
|
target_avg_loss /= len(self.optimizer)
|
||||||
"""
|
|
||||||
fs = fsspec.get_mapper(path).fs
|
|
||||||
file_names = fs.glob(os.path.join(path, "*.pth.tar"))
|
|
||||||
scheme = urlparse(path).scheme
|
|
||||||
if scheme: # scheme is not preserved in fs.glob, add it back
|
|
||||||
file_names = [scheme + "://" + file_name for file_name in file_names]
|
|
||||||
last_models = {}
|
|
||||||
last_model_nums = {}
|
|
||||||
for key in ["checkpoint", "best_model"]:
|
|
||||||
last_model_num = None
|
|
||||||
last_model = None
|
|
||||||
# pass all the checkpoint files and find
|
|
||||||
# the one with the largest model number suffix.
|
|
||||||
for file_name in file_names:
|
|
||||||
match = re.search(f"{key}_([0-9]+)", file_name)
|
|
||||||
if match is not None:
|
|
||||||
model_num = int(match.groups()[0])
|
|
||||||
if last_model_num is None or model_num > last_model_num:
|
|
||||||
last_model_num = model_num
|
|
||||||
last_model = file_name
|
|
||||||
|
|
||||||
# if there is no checkpoint found above
|
|
||||||
# find the checkpoint with the latest
|
|
||||||
# modification date.
|
|
||||||
key_file_names = [fn for fn in file_names if key in fn]
|
|
||||||
if last_model is None and len(key_file_names) > 0:
|
|
||||||
last_model = max(key_file_names, key=os.path.getctime)
|
|
||||||
last_model_num = load_fsspec(last_model)["step"]
|
|
||||||
|
|
||||||
if last_model is not None:
|
|
||||||
last_models[key] = last_model
|
|
||||||
last_model_nums[key] = last_model_num
|
|
||||||
|
|
||||||
# check what models were found
|
|
||||||
if not last_models:
|
|
||||||
raise ValueError(f"No models found in continue path {path}!")
|
|
||||||
if "checkpoint" not in last_models: # no checkpoint just best model
|
|
||||||
last_models["checkpoint"] = last_models["best_model"]
|
|
||||||
elif "best_model" not in last_models: # no best model
|
|
||||||
# this shouldn't happen, but let's handle it just in case
|
|
||||||
last_models["best_model"] = last_models["checkpoint"]
|
|
||||||
# finally check if last best model is more recent than checkpoint
|
|
||||||
elif last_model_nums["best_model"] > last_model_nums["checkpoint"]:
|
|
||||||
last_models["checkpoint"] = last_models["best_model"]
|
|
||||||
|
|
||||||
return last_models["checkpoint"], last_models["best_model"]
|
|
||||||
|
|
||||||
|
|
||||||
def process_args(args, config=None):
|
|
||||||
"""Process parsed comand line arguments and initialize the config if not provided.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
args (argparse.Namespace or dict like): Parsed input arguments.
|
|
||||||
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
c (TTS.utils.io.AttrDict): Config paramaters.
|
|
||||||
out_path (str): Path to save models and logging.
|
|
||||||
audio_path (str): Path to save generated test audios.
|
|
||||||
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
|
|
||||||
logging to the console.
|
|
||||||
|
|
||||||
dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
|
|
||||||
|
|
||||||
TODO:
|
|
||||||
- Interactive config definition.
|
|
||||||
"""
|
|
||||||
if isinstance(args, tuple):
|
|
||||||
args, coqpit_overrides = args
|
|
||||||
if args.continue_path:
|
|
||||||
# continue a previous training from its output folder
|
|
||||||
experiment_path = args.continue_path
|
|
||||||
args.config_path = os.path.join(args.continue_path, "config.json")
|
|
||||||
args.restore_path, best_model = get_last_checkpoint(args.continue_path)
|
|
||||||
if not args.best_path:
|
|
||||||
args.best_path = best_model
|
|
||||||
# init config if not already defined
|
|
||||||
if config is None:
|
|
||||||
if args.config_path:
|
|
||||||
# init from a file
|
|
||||||
config = load_config(args.config_path)
|
|
||||||
else:
|
else:
|
||||||
# init from console args
|
target_avg_loss = keep_avg_target["avg_loss"]
|
||||||
from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
|
return target_avg_loss
|
||||||
|
|
||||||
config_base = BaseTrainingConfig()
|
def _setup_logger_config(self, log_file: str) -> None:
|
||||||
config_base.parse_known_args(coqpit_overrides)
|
"""Write log strings to a file and print logs to the terminal.
|
||||||
config = register_config(config_base.model)()
|
TODO: Causes formatting issues in pdb debugging."""
|
||||||
# override values from command-line args
|
|
||||||
config.parse_known_args(coqpit_overrides, relaxed_parser=True)
|
|
||||||
experiment_path = args.continue_path
|
|
||||||
if not experiment_path:
|
|
||||||
experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
|
|
||||||
audio_path = os.path.join(experiment_path, "test_audios")
|
|
||||||
config.output_log_path = experiment_path
|
|
||||||
# setup rank 0 process in distributed training
|
|
||||||
dashboard_logger = None
|
|
||||||
if args.rank == 0:
|
|
||||||
new_fields = {}
|
|
||||||
if args.restore_path:
|
|
||||||
new_fields["restore_path"] = args.restore_path
|
|
||||||
new_fields["github_branch"] = get_git_branch()
|
|
||||||
# if model characters are not set in the config file
|
|
||||||
# save the default set to the config file for future
|
|
||||||
# compatibility.
|
|
||||||
if config.has("characters") and config.characters is None:
|
|
||||||
used_characters = parse_symbols()
|
|
||||||
new_fields["characters"] = used_characters
|
|
||||||
copy_model_files(config, experiment_path, new_fields)
|
|
||||||
dashboard_logger = init_dashboard_logger(config)
|
|
||||||
c_logger = ConsoleLogger()
|
|
||||||
return config, experiment_path, audio_path, c_logger, dashboard_logger
|
|
||||||
|
|
||||||
|
class Logger(object):
|
||||||
|
def __init__(self, print_to_terminal=True):
|
||||||
|
self.print_to_terminal = print_to_terminal
|
||||||
|
self.terminal = sys.stdout
|
||||||
|
self.log_file = log_file
|
||||||
|
|
||||||
def init_arguments():
|
def write(self, message):
|
||||||
train_config = TrainingArgs()
|
if self.print_to_terminal:
|
||||||
parser = train_config.init_argparse(arg_prefix="")
|
self.terminal.write(message)
|
||||||
return parser
|
with open(self.log_file, "a", encoding="utf-8") as f:
|
||||||
|
f.write(message)
|
||||||
|
|
||||||
|
def flush(self):
|
||||||
|
# this flush method is needed for python 3 compatibility.
|
||||||
|
# this handles the flush command by doing nothing.
|
||||||
|
# you might want to specify some extra behavior here.
|
||||||
|
pass
|
||||||
|
|
||||||
def init_training(argv: Union[List, Coqpit], config: Coqpit = None):
|
# don't let processes rank > 0 write to the terminal
|
||||||
"""Initialization of a training run."""
|
sys.stdout = Logger(self.args.rank == 0)
|
||||||
if isinstance(argv, Coqpit):
|
|
||||||
parser = argv.init_argparse(arg_prefix="")
|
@staticmethod
|
||||||
else:
|
def _is_apex_available() -> bool:
|
||||||
parser = init_arguments()
|
"""Check if Nvidia's APEX is available."""
|
||||||
args = parser.parse_known_args()
|
return importlib.util.find_spec("apex") is not None
|
||||||
config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
|
|
||||||
return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
|
|
||||||
|
|
|
@ -3,15 +3,15 @@ import os
|
||||||
from inspect import isclass
|
from inspect import isclass
|
||||||
|
|
||||||
# import all files under configs/
|
# import all files under configs/
|
||||||
configs_dir = os.path.dirname(__file__)
|
# configs_dir = os.path.dirname(__file__)
|
||||||
for file in os.listdir(configs_dir):
|
# for file in os.listdir(configs_dir):
|
||||||
path = os.path.join(configs_dir, file)
|
# path = os.path.join(configs_dir, file)
|
||||||
if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
|
# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
|
||||||
config_name = file[: file.find(".py")] if file.endswith(".py") else file
|
# config_name = file[: file.find(".py")] if file.endswith(".py") else file
|
||||||
module = importlib.import_module("TTS.tts.configs." + config_name)
|
# module = importlib.import_module("TTS.tts.configs." + config_name)
|
||||||
for attribute_name in dir(module):
|
# for attribute_name in dir(module):
|
||||||
attribute = getattr(module, attribute_name)
|
# attribute = getattr(module, attribute_name)
|
||||||
|
|
||||||
if isclass(attribute):
|
# if isclass(attribute):
|
||||||
# Add the class to this package's variables
|
# # Add the class to this package's variables
|
||||||
globals()[attribute_name] = attribute
|
# globals()[attribute_name] = attribute
|
||||||
|
|
|
@ -11,7 +11,7 @@ class FastPitchConfig(BaseTTSConfig):
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
>>> from TTS.tts.configs import FastPitchConfig
|
>>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
|
||||||
>>> config = FastPitchConfig()
|
>>> config = FastPitchConfig()
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -30,6 +30,10 @@ class FastPitchConfig(BaseTTSConfig):
|
||||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
for the rest. Defaults to 10.
|
for the rest. Defaults to 10.
|
||||||
|
|
||||||
|
speakers_file (str):
|
||||||
|
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
|
||||||
|
speaker names. Defaults to `None`.
|
||||||
|
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
@ -105,6 +109,8 @@ class FastPitchConfig(BaseTTSConfig):
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs()
|
model_args: ForwardTTSArgs = ForwardTTSArgs()
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
|
num_speakers: int = 0
|
||||||
|
speakers_file: str = None
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_d_vector_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
d_vector_file: str = False
|
d_vector_file: str = False
|
||||||
|
@ -149,3 +155,22 @@ class FastPitchConfig(BaseTTSConfig):
|
||||||
"Prior to November 22, 1963.",
|
"Prior to November 22, 1963.",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
|
||||||
|
if self.num_speakers > 0:
|
||||||
|
self.model_args.num_speakers = self.num_speakers
|
||||||
|
|
||||||
|
# speaker embedding settings
|
||||||
|
if self.use_speaker_embedding:
|
||||||
|
self.model_args.use_speaker_embedding = True
|
||||||
|
if self.speakers_file:
|
||||||
|
self.model_args.speakers_file = self.speakers_file
|
||||||
|
|
||||||
|
# d-vector settings
|
||||||
|
if self.use_d_vector_file:
|
||||||
|
self.model_args.use_d_vector_file = True
|
||||||
|
if self.d_vector_dim is not None and self.d_vector_dim > 0:
|
||||||
|
self.model_args.d_vector_dim = self.d_vector_dim
|
||||||
|
if self.d_vector_file:
|
||||||
|
self.model_args.d_vector_file = self.d_vector_file
|
||||||
|
|
|
@ -30,6 +30,11 @@ class FastSpeechConfig(BaseTTSConfig):
|
||||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
for the rest. Defaults to 10.
|
for the rest. Defaults to 10.
|
||||||
|
|
||||||
|
speakers_file (str):
|
||||||
|
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
|
||||||
|
speaker names. Defaults to `None`.
|
||||||
|
|
||||||
|
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
@ -105,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig):
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
|
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
|
speakers_file: str = None
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_d_vector_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
d_vector_file: str = False
|
d_vector_file: str = False
|
||||||
|
@ -149,3 +155,22 @@ class FastSpeechConfig(BaseTTSConfig):
|
||||||
"Prior to November 22, 1963.",
|
"Prior to November 22, 1963.",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
|
||||||
|
if self.num_speakers > 0:
|
||||||
|
self.model_args.num_speakers = self.num_speakers
|
||||||
|
|
||||||
|
# speaker embedding settings
|
||||||
|
if self.use_speaker_embedding:
|
||||||
|
self.model_args.use_speaker_embedding = True
|
||||||
|
if self.speakers_file:
|
||||||
|
self.model_args.speakers_file = self.speakers_file
|
||||||
|
|
||||||
|
# d-vector settings
|
||||||
|
if self.use_d_vector_file:
|
||||||
|
self.model_args.use_d_vector_file = True
|
||||||
|
if self.d_vector_dim is not None and self.d_vector_dim > 0:
|
||||||
|
self.model_args.d_vector_dim = self.d_vector_dim
|
||||||
|
if self.d_vector_file:
|
||||||
|
self.model_args.d_vector_file = self.d_vector_file
|
||||||
|
|
|
@ -218,7 +218,3 @@ class BaseTTSConfig(BaseTrainingConfig):
|
||||||
lr_scheduler_params: dict = field(default_factory=lambda: {})
|
lr_scheduler_params: dict = field(default_factory=lambda: {})
|
||||||
# testing
|
# testing
|
||||||
test_sentences: List[str] = field(default_factory=lambda: [])
|
test_sentences: List[str] = field(default_factory=lambda: [])
|
||||||
# multi-speaker
|
|
||||||
use_speaker_embedding: bool = False
|
|
||||||
use_d_vector_file: bool = False
|
|
||||||
d_vector_dim: int = 0
|
|
||||||
|
|
|
@ -30,6 +30,10 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
for the rest. Defaults to 10.
|
for the rest. Defaults to 10.
|
||||||
|
|
||||||
|
speakers_file (str):
|
||||||
|
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
|
||||||
|
speaker names. Defaults to `None`.
|
||||||
|
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
@ -117,12 +121,13 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
},
|
},
|
||||||
out_channels=80,
|
out_channels=80,
|
||||||
hidden_channels=128,
|
hidden_channels=128,
|
||||||
num_speakers=0,
|
|
||||||
positional_encoding=True,
|
positional_encoding=True,
|
||||||
detach_duration_predictor=True,
|
detach_duration_predictor=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
|
num_speakers: int = 0
|
||||||
|
speakers_file: str = None
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_d_vector_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
d_vector_file: str = False
|
d_vector_file: str = False
|
||||||
|
@ -166,3 +171,22 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
"Prior to November 22, 1963.",
|
"Prior to November 22, 1963.",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
|
||||||
|
if self.num_speakers > 0:
|
||||||
|
self.model_args.num_speakers = self.num_speakers
|
||||||
|
|
||||||
|
# speaker embedding settings
|
||||||
|
if self.use_speaker_embedding:
|
||||||
|
self.model_args.use_speaker_embedding = True
|
||||||
|
if self.speakers_file:
|
||||||
|
self.model_args.speakers_file = self.speakers_file
|
||||||
|
|
||||||
|
# d-vector settings
|
||||||
|
if self.use_d_vector_file:
|
||||||
|
self.model_args.use_d_vector_file = True
|
||||||
|
if self.d_vector_dim is not None and self.d_vector_dim > 0:
|
||||||
|
self.model_args.d_vector_dim = self.d_vector_dim
|
||||||
|
if self.d_vector_file:
|
||||||
|
self.model_args.d_vector_file = self.d_vector_file
|
||||||
|
|
|
@ -106,7 +106,7 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
Weight decay coefficient. Defaults to `1e-6`.
|
Weight decay coefficient. Defaults to `1e-6`.
|
||||||
grad_clip (float):
|
grad_clip (float):
|
||||||
Gradient clipping threshold. Defaults to `5`.
|
Gradient clipping threshold. Defaults to `5`.
|
||||||
seq_len_notm (bool):
|
seq_len_norm (bool):
|
||||||
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
|
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
|
||||||
is divided by the sequence length. Defaults to False.
|
is divided by the sequence length. Defaults to False.
|
||||||
loss_masking (bool):
|
loss_masking (bool):
|
||||||
|
|
|
@ -139,3 +139,36 @@ class VitsConfig(BaseTTSConfig):
|
||||||
"Prior to November 22, 1963.",
|
"Prior to November 22, 1963.",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# multi-speaker settings
|
||||||
|
# use speaker embedding layer
|
||||||
|
num_speakers: int = 0
|
||||||
|
use_speaker_embedding: bool = False
|
||||||
|
speakers_file: str = None
|
||||||
|
speaker_embedding_channels: int = 256
|
||||||
|
|
||||||
|
# use d-vectors
|
||||||
|
use_d_vector_file: bool = False
|
||||||
|
d_vector_file: str = False
|
||||||
|
d_vector_dim: int = None
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
|
||||||
|
if self.num_speakers > 0:
|
||||||
|
self.model_args.num_speakers = self.num_speakers
|
||||||
|
|
||||||
|
# speaker embedding settings
|
||||||
|
if self.use_speaker_embedding:
|
||||||
|
self.model_args.use_speaker_embedding = True
|
||||||
|
if self.speakers_file:
|
||||||
|
self.model_args.speakers_file = self.speakers_file
|
||||||
|
if self.speaker_embedding_channels:
|
||||||
|
self.model_args.speaker_embedding_channels = self.speaker_embedding_channels
|
||||||
|
|
||||||
|
# d-vector settings
|
||||||
|
if self.use_d_vector_file:
|
||||||
|
self.model_args.use_d_vector_file = True
|
||||||
|
if self.d_vector_dim is not None and self.d_vector_dim > 0:
|
||||||
|
self.model_args.d_vector_dim = self.d_vector_dim
|
||||||
|
if self.d_vector_file:
|
||||||
|
self.model_args.d_vector_file = self.d_vector_file
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from TTS.tts.datasets.dataset import *
|
||||||
from TTS.tts.datasets.formatters import *
|
from TTS.tts.datasets.formatters import *
|
||||||
from TTS.tts.datasets.TTSDataset import TTSDataset
|
|
||||||
|
|
||||||
|
|
||||||
def split_dataset(items):
|
def split_dataset(items):
|
||||||
|
@ -31,11 +31,12 @@ def split_dataset(items):
|
||||||
return items[:eval_split_size], items[eval_split_size:]
|
return items[:eval_split_size], items[eval_split_size:]
|
||||||
|
|
||||||
|
|
||||||
def load_meta_data(datasets: List[Dict], eval_split=True) -> Tuple[List[List], List[List]]:
|
def load_tts_samples(datasets: Union[List[Dict], Dict], eval_split=True) -> Tuple[List[List], List[List]]:
|
||||||
"""Parse the dataset, load the samples as a list and load the attention alignments if provided.
|
"""Parse the dataset, load the samples as a list and load the attention alignments if provided.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
datasets (List[Dict]): A list of dataset dictionaries or dataset configs.
|
datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are
|
||||||
|
in the list, they are all merged.
|
||||||
eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate
|
eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate
|
||||||
an eval split automatically. Defaults to True.
|
an eval split automatically. Defaults to True.
|
||||||
|
|
||||||
|
@ -44,6 +45,8 @@ def load_meta_data(datasets: List[Dict], eval_split=True) -> Tuple[List[List], L
|
||||||
"""
|
"""
|
||||||
meta_data_train_all = []
|
meta_data_train_all = []
|
||||||
meta_data_eval_all = [] if eval_split else None
|
meta_data_eval_all = [] if eval_split else None
|
||||||
|
if not isinstance(datasets, list):
|
||||||
|
datasets = [datasets]
|
||||||
for dataset in datasets:
|
for dataset in datasets:
|
||||||
name = dataset["name"]
|
name = dataset["name"]
|
||||||
root_path = dataset["path"]
|
root_path = dataset["path"]
|
||||||
|
|
|
@ -330,7 +330,7 @@ class TTSDataset(Dataset):
|
||||||
if by_audio_len:
|
if by_audio_len:
|
||||||
lengths = []
|
lengths = []
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
lengths.append(os.path.getsize(item[1]))
|
lengths.append(os.path.getsize(item[1]) / 16 * 8) # assuming 16bit audio
|
||||||
lengths = np.array(lengths)
|
lengths = np.array(lengths)
|
||||||
else:
|
else:
|
||||||
lengths = np.array([len(ins[0]) for ins in self.items])
|
lengths = np.array([len(ins[0]) for ins in self.items])
|
||||||
|
@ -419,6 +419,7 @@ class TTSDataset(Dataset):
|
||||||
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
|
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
|
||||||
else:
|
else:
|
||||||
d_vectors = None
|
d_vectors = None
|
||||||
|
|
||||||
# get numerical speaker ids from speaker names
|
# get numerical speaker ids from speaker names
|
||||||
if self.speaker_id_mapping:
|
if self.speaker_id_mapping:
|
||||||
speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]]
|
speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]]
|
|
@ -308,14 +308,14 @@ def mls(root_path, meta_files=None):
|
||||||
# ======================================== VOX CELEB ===========================================
|
# ======================================== VOX CELEB ===========================================
|
||||||
def voxceleb2(root_path, meta_file=None):
|
def voxceleb2(root_path, meta_file=None):
|
||||||
"""
|
"""
|
||||||
:param meta_file Used only for consistency with load_meta_data api
|
:param meta_file Used only for consistency with load_tts_samples api
|
||||||
"""
|
"""
|
||||||
return _voxcel_x(root_path, meta_file, voxcel_idx="2")
|
return _voxcel_x(root_path, meta_file, voxcel_idx="2")
|
||||||
|
|
||||||
|
|
||||||
def voxceleb1(root_path, meta_file=None):
|
def voxceleb1(root_path, meta_file=None):
|
||||||
"""
|
"""
|
||||||
:param meta_file Used only for consistency with load_meta_data api
|
:param meta_file Used only for consistency with load_tts_samples api
|
||||||
"""
|
"""
|
||||||
return _voxcel_x(root_path, meta_file, voxcel_idx="1")
|
return _voxcel_x(root_path, meta_file, voxcel_idx="1")
|
||||||
|
|
||||||
|
|
|
@ -106,7 +106,6 @@ class InvConvNear(nn.Module):
|
||||||
- x: :math:`[B, C, T]`
|
- x: :math:`[B, C, T]`
|
||||||
- x_mask: :math:`[B, 1, T]`
|
- x_mask: :math:`[B, 1, T]`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
b, c, t = x.size()
|
b, c, t = x.size()
|
||||||
assert c % self.num_splits == 0
|
assert c % self.num_splits == 0
|
||||||
if x_mask is None:
|
if x_mask is None:
|
||||||
|
|
|
@ -410,11 +410,6 @@ class TacotronLoss(torch.nn.Module):
|
||||||
return_dict["postnet_ssim_loss"] = postnet_ssim_loss
|
return_dict["postnet_ssim_loss"] = postnet_ssim_loss
|
||||||
|
|
||||||
return_dict["loss"] = loss
|
return_dict["loss"] = loss
|
||||||
|
|
||||||
# check if any loss is NaN
|
|
||||||
for key, loss in return_dict.items():
|
|
||||||
if torch.isnan(loss):
|
|
||||||
raise RuntimeError(f" [!] NaN loss with {key}.")
|
|
||||||
return return_dict
|
return return_dict
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -126,27 +126,24 @@ class GravesAttention(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
class OriginalAttention(nn.Module):
|
class OriginalAttention(nn.Module):
|
||||||
"""Bahdanau Attention with various optional modifications. Proposed below.
|
"""Bahdanau Attention with various optional modifications.
|
||||||
- Location sensitive attention: https://arxiv.org/abs/1712.05884
|
- Location sensitive attention: https://arxiv.org/abs/1712.05884
|
||||||
- Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
|
- Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
|
||||||
- Using sigmoid instead of softmax normalization
|
- Using sigmoid instead of softmax normalization
|
||||||
- Attention windowing at inference time
|
- Attention windowing at inference time
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
Location Sensitive Attention is an attention mechanism that extends the additive attention mechanism
|
Location Sensitive Attention extends the additive attention mechanism
|
||||||
to use cumulative attention weights from previous decoder time steps as an additional feature.
|
to use cumulative attention weights from previous decoder time steps with the current time step features.
|
||||||
|
|
||||||
Forward attention considers only the alignment paths that satisfy the monotonic condition at each
|
Forward attention computes most probable monotonic alignment. The modified attention probabilities at each
|
||||||
decoder timestep. The modified attention probabilities at each timestep are computed recursively
|
timestep are computed recursively by the forward algorithm.
|
||||||
using a forward algorithm.
|
|
||||||
|
|
||||||
Transition agent for forward attention is further proposed, which helps the attention mechanism
|
Transition agent in the forward attention explicitly gates the attention mechanism whether to move forward or
|
||||||
to make decisions whether to move forward or stay at each decoder timestep.
|
stay at each decoder timestep.
|
||||||
|
|
||||||
Attention windowing applies a sliding windows to time steps of the input tensor centering at the last
|
|
||||||
time step with the largest attention weight. It is especially useful at inference to keep the attention
|
|
||||||
alignment diagonal.
|
|
||||||
|
|
||||||
|
Attention windowing is an inductive prior that prevents the model from attending to previous and future timesteps
|
||||||
|
beyond a certain window.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query_dim (int): number of channels in the query tensor.
|
query_dim (int): number of channels in the query tensor.
|
||||||
|
|
|
@ -2,7 +2,7 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols
|
||||||
from TTS.utils.generic_utils import find_module
|
from TTS.utils.generic_utils import find_module
|
||||||
|
|
||||||
|
|
||||||
def setup_model(config):
|
def setup_model(config, speaker_manager: "SpeakerManager" = None):
|
||||||
print(" > Using model: {}".format(config.model))
|
print(" > Using model: {}".format(config.model))
|
||||||
# fetch the right model implementation.
|
# fetch the right model implementation.
|
||||||
if "base_model" in config and config["base_model"] is not None:
|
if "base_model" in config and config["base_model"] is not None:
|
||||||
|
@ -31,7 +31,7 @@ def setup_model(config):
|
||||||
config.model_params.num_chars = num_chars
|
config.model_params.num_chars = num_chars
|
||||||
if "model_args" in config:
|
if "model_args" in config:
|
||||||
config.model_args.num_chars = num_chars
|
config.model_args.num_chars = num_chars
|
||||||
model = MyModel(config)
|
model = MyModel(config, speaker_manager=speaker_manager)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, Tuple
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
@ -12,8 +11,8 @@ from TTS.tts.layers.feed_forward.encoder import Encoder
|
||||||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
|
|
||||||
|
|
||||||
|
@ -101,9 +100,10 @@ class AlignTTS(BaseTTS):
|
||||||
|
|
||||||
# pylint: disable=dangerous-default-value
|
# pylint: disable=dangerous-default-value
|
||||||
|
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
|
||||||
|
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
self.speaker_manager = speaker_manager
|
||||||
self.config = config
|
self.config = config
|
||||||
self.phase = -1
|
self.phase = -1
|
||||||
self.length_scale = (
|
self.length_scale = (
|
||||||
|
@ -360,9 +360,7 @@ class AlignTTS(BaseTTS):
|
||||||
|
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(
|
def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use
|
||||||
self, ap: AudioProcessor, batch: dict, outputs: dict
|
|
||||||
) -> Tuple[Dict, Dict]: # pylint: disable=no-self-use
|
|
||||||
model_outputs = outputs["model_outputs"]
|
model_outputs = outputs["model_outputs"]
|
||||||
alignments = outputs["alignments"]
|
alignments = outputs["alignments"]
|
||||||
mel_input = batch["mel_input"]
|
mel_input = batch["mel_input"]
|
||||||
|
@ -381,11 +379,22 @@ class AlignTTS(BaseTTS):
|
||||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||||
return figures, {"audio": train_audio}
|
return figures, {"audio": train_audio}
|
||||||
|
|
||||||
|
def train_log(
|
||||||
|
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
|
||||||
|
) -> None: # pylint: disable=no-self-use
|
||||||
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.train_figures(steps, figures)
|
||||||
|
logger.train_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
def eval_step(self, batch: dict, criterion: nn.Module):
|
def eval_step(self, batch: dict, criterion: nn.Module):
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
|
||||||
return self.train_log(ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.eval_figures(steps, figures)
|
||||||
|
logger.eval_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False
|
||||||
|
|
|
@ -1,59 +1,26 @@
|
||||||
import copy
|
import copy
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from dataclasses import dataclass
|
from typing import Dict
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from coqpit import MISSING, Coqpit
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from TTS.tts.layers.losses import TacotronLoss
|
from TTS.tts.layers.losses import TacotronLoss
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.helpers import sequence_mask
|
from TTS.tts.utils.helpers import sequence_mask
|
||||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
|
|
||||||
from TTS.tts.utils.text import make_symbols
|
|
||||||
from TTS.utils.generic_utils import format_aux_input
|
from TTS.utils.generic_utils import format_aux_input
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
from TTS.utils.training import gradual_training_scheduler
|
from TTS.utils.training import gradual_training_scheduler
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class BaseTacotronArgs(Coqpit):
|
|
||||||
"""TODO: update Tacotron configs using it"""
|
|
||||||
|
|
||||||
num_chars: int = MISSING
|
|
||||||
num_speakers: int = MISSING
|
|
||||||
r: int = MISSING
|
|
||||||
out_channels: int = 80
|
|
||||||
decoder_output_dim: int = 80
|
|
||||||
attn_type: str = "original"
|
|
||||||
attn_win: bool = False
|
|
||||||
attn_norm: str = "softmax"
|
|
||||||
prenet_type: str = "original"
|
|
||||||
prenet_dropout: bool = True
|
|
||||||
prenet_dropout_at_inference: bool = False
|
|
||||||
forward_attn: bool = False
|
|
||||||
trans_agent: bool = False
|
|
||||||
forward_attn_mask: bool = False
|
|
||||||
location_attn: bool = True
|
|
||||||
attn_K: int = 5
|
|
||||||
separate_stopnet: bool = True
|
|
||||||
bidirectional_decoder: bool = False
|
|
||||||
double_decoder_consistency: bool = False
|
|
||||||
ddc_r: int = None
|
|
||||||
encoder_in_features: int = 512
|
|
||||||
decoder_in_features: int = 512
|
|
||||||
d_vector_dim: int = None
|
|
||||||
use_gst: bool = False
|
|
||||||
gst: bool = None
|
|
||||||
gradual_training: bool = None
|
|
||||||
|
|
||||||
|
|
||||||
class BaseTacotron(BaseTTS):
|
class BaseTacotron(BaseTTS):
|
||||||
def __init__(self, config: Coqpit):
|
"""Base class shared by Tacotron and Tacotron2"""
|
||||||
"""Abstract Tacotron class"""
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
|
def __init__(self, config: Coqpit):
|
||||||
|
super().__init__(config)
|
||||||
|
|
||||||
|
# pass all config fields as class attributes
|
||||||
for key in config:
|
for key in config:
|
||||||
setattr(self, key, config[key])
|
setattr(self, key, config[key])
|
||||||
|
|
||||||
|
@ -78,6 +45,7 @@ class BaseTacotron(BaseTTS):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_aux_input(aux_input: Dict) -> Dict:
|
def _format_aux_input(aux_input: Dict) -> Dict:
|
||||||
|
"""Set missing fields to their default values"""
|
||||||
if aux_input:
|
if aux_input:
|
||||||
return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)
|
return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)
|
||||||
return None
|
return None
|
||||||
|
@ -86,14 +54,12 @@ class BaseTacotron(BaseTTS):
|
||||||
# INIT FUNCTIONS
|
# INIT FUNCTIONS
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
def _init_states(self):
|
|
||||||
self.embedded_speakers = None
|
|
||||||
self.embedded_speakers_projected = None
|
|
||||||
|
|
||||||
def _init_backward_decoder(self):
|
def _init_backward_decoder(self):
|
||||||
|
"""Init the backward decoder for Forward-Backward decoding."""
|
||||||
self.decoder_backward = copy.deepcopy(self.decoder)
|
self.decoder_backward = copy.deepcopy(self.decoder)
|
||||||
|
|
||||||
def _init_coarse_decoder(self):
|
def _init_coarse_decoder(self):
|
||||||
|
"""Init the coarse decoder for Double-Decoder Consistency."""
|
||||||
self.coarse_decoder = copy.deepcopy(self.decoder)
|
self.coarse_decoder = copy.deepcopy(self.decoder)
|
||||||
self.coarse_decoder.r_init = self.ddc_r
|
self.coarse_decoder.r_init = self.ddc_r
|
||||||
self.coarse_decoder.set_r(self.ddc_r)
|
self.coarse_decoder.set_r(self.ddc_r)
|
||||||
|
@ -113,6 +79,13 @@ class BaseTacotron(BaseTTS):
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
): # pylint: disable=unused-argument, redefined-builtin
|
||||||
|
"""Load model checkpoint and set up internals.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (Coqpit): model configuration.
|
||||||
|
checkpoint_path (str): path to checkpoint file.
|
||||||
|
eval (bool): whether to load model for evaluation.
|
||||||
|
"""
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
||||||
self.load_state_dict(state["model"])
|
self.load_state_dict(state["model"])
|
||||||
# TODO: set r in run-time by taking it from the new config
|
# TODO: set r in run-time by taking it from the new config
|
||||||
|
@ -131,61 +104,9 @@ class BaseTacotron(BaseTTS):
|
||||||
assert not self.training
|
assert not self.training
|
||||||
|
|
||||||
def get_criterion(self) -> nn.Module:
|
def get_criterion(self) -> nn.Module:
|
||||||
|
"""Get the model criterion used in training."""
|
||||||
return TacotronLoss(self.config)
|
return TacotronLoss(self.config)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_characters(config: Coqpit) -> str:
|
|
||||||
# TODO: implement CharacterProcessor
|
|
||||||
if config.characters is not None:
|
|
||||||
symbols, phonemes = make_symbols(**config.characters)
|
|
||||||
else:
|
|
||||||
from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel
|
|
||||||
parse_symbols,
|
|
||||||
phonemes,
|
|
||||||
symbols,
|
|
||||||
)
|
|
||||||
|
|
||||||
config.characters = parse_symbols()
|
|
||||||
model_characters = phonemes if config.use_phonemes else symbols
|
|
||||||
return model_characters, config
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager:
|
|
||||||
return get_speaker_manager(config, restore_path, data, out_path)
|
|
||||||
|
|
||||||
def get_aux_input(self, **kwargs) -> Dict:
|
|
||||||
"""Compute Tacotron's auxiliary inputs based on model config.
|
|
||||||
- speaker d_vector
|
|
||||||
- style wav for GST
|
|
||||||
- speaker ID for speaker embedding
|
|
||||||
"""
|
|
||||||
# setup speaker_id
|
|
||||||
if self.config.use_speaker_embedding:
|
|
||||||
speaker_id = kwargs.get("speaker_id", 0)
|
|
||||||
else:
|
|
||||||
speaker_id = None
|
|
||||||
# setup d_vector
|
|
||||||
d_vector = (
|
|
||||||
self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0])
|
|
||||||
if self.config.use_d_vector_file and self.config.use_speaker_embedding
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
# setup style_mel
|
|
||||||
if "style_wav" in kwargs:
|
|
||||||
style_wav = kwargs["style_wav"]
|
|
||||||
elif self.config.has("gst_style_input"):
|
|
||||||
style_wav = self.config.gst_style_input
|
|
||||||
else:
|
|
||||||
style_wav = None
|
|
||||||
if style_wav is None and "use_gst" in self.config and self.config.use_gst:
|
|
||||||
# inicialize GST with zero dict.
|
|
||||||
style_wav = {}
|
|
||||||
print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!")
|
|
||||||
for i in range(self.config.gst["gst_num_style_tokens"]):
|
|
||||||
style_wav[str(i)] = 0
|
|
||||||
aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
|
|
||||||
return aux_inputs
|
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
# COMMON COMPUTE FUNCTIONS
|
# COMMON COMPUTE FUNCTIONS
|
||||||
#############################
|
#############################
|
||||||
|
@ -231,15 +152,6 @@ class BaseTacotron(BaseTTS):
|
||||||
# EMBEDDING FUNCTIONS
|
# EMBEDDING FUNCTIONS
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
def compute_speaker_embedding(self, speaker_ids):
|
|
||||||
"""Compute speaker embedding vectors"""
|
|
||||||
if hasattr(self, "speaker_embedding") and speaker_ids is None:
|
|
||||||
raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
|
|
||||||
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
|
||||||
self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
|
|
||||||
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
|
|
||||||
self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1)
|
|
||||||
|
|
||||||
def compute_gst(self, inputs, style_input, speaker_embedding=None):
|
def compute_gst(self, inputs, style_input, speaker_embedding=None):
|
||||||
"""Compute global style token"""
|
"""Compute global style token"""
|
||||||
if isinstance(style_input, dict):
|
if isinstance(style_input, dict):
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -9,20 +10,20 @@ from torch.utils.data import DataLoader
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
from TTS.model import BaseModel
|
from TTS.model import BaseModel
|
||||||
from TTS.tts.datasets import TTSDataset
|
from TTS.tts.configs.shared_configs import CharactersConfig
|
||||||
|
from TTS.tts.datasets.dataset import TTSDataset
|
||||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
|
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
|
||||||
from TTS.tts.utils.synthesis import synthesis
|
from TTS.tts.utils.synthesis import synthesis
|
||||||
from TTS.tts.utils.text import make_symbols
|
from TTS.tts.utils.text import make_symbols
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
|
|
||||||
# pylint: skip-file
|
# pylint: skip-file
|
||||||
|
|
||||||
|
|
||||||
class BaseTTS(BaseModel):
|
class BaseTTS(BaseModel):
|
||||||
"""Abstract `tts` class. Every new `tts` model must inherit this.
|
"""Base `tts` class. Every new `tts` model must inherit this.
|
||||||
|
|
||||||
It defines `tts` specific functions on top of `Model`.
|
It defines common `tts` specific functions on top of `Model` implementation.
|
||||||
|
|
||||||
Notes on input/output tensor shapes:
|
Notes on input/output tensor shapes:
|
||||||
Any input or output tensor of the model must be shaped as
|
Any input or output tensor of the model must be shaped as
|
||||||
|
@ -32,6 +33,30 @@ class BaseTTS(BaseModel):
|
||||||
- 1D tensors `batch x 1`
|
- 1D tensors `batch x 1`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def _set_model_args(self, config: Coqpit):
|
||||||
|
"""Setup model args based on the config type.
|
||||||
|
|
||||||
|
If the config is for training with a name like "*Config", then the model args are embedded in the
|
||||||
|
config.model_args
|
||||||
|
|
||||||
|
If the config is for the model with a name like "*Args", then we assign them directly.
|
||||||
|
"""
|
||||||
|
# don't use isinstance, to avoid a recursive import
|
||||||
|
if "Config" in config.__class__.__name__:
|
||||||
|
if "characters" in config:
|
||||||
|
_, self.config, num_chars = self.get_characters(config)
|
||||||
|
self.config.num_chars = num_chars
|
||||||
|
if hasattr(self.config, "model_args"):
|
||||||
|
config.model_args.num_chars = num_chars
|
||||||
|
self.args = self.config.model_args
|
||||||
|
else:
|
||||||
|
self.config = config
|
||||||
|
self.args = config.model_args
|
||||||
|
elif "Args" in config.__class__.__name__:
|
||||||
|
self.args = config
|
||||||
|
else:
|
||||||
|
raise ValueError("config must be either a *Config or *Args")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_characters(config: Coqpit) -> str:
|
def get_characters(config: Coqpit) -> str:
|
||||||
# TODO: implement CharacterProcessor
|
# TODO: implement CharacterProcessor
|
||||||
|
@ -40,7 +65,7 @@ class BaseTTS(BaseModel):
|
||||||
else:
|
else:
|
||||||
from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols
|
from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols
|
||||||
|
|
||||||
config.characters = parse_symbols()
|
config.characters = CharactersConfig(**parse_symbols())
|
||||||
model_characters = phonemes if config.use_phonemes else symbols
|
model_characters = phonemes if config.use_phonemes else symbols
|
||||||
num_chars = len(model_characters) + getattr(config, "add_blank", False)
|
num_chars = len(model_characters) + getattr(config, "add_blank", False)
|
||||||
return model_characters, config, num_chars
|
return model_characters, config, num_chars
|
||||||
|
@ -48,35 +73,18 @@ class BaseTTS(BaseModel):
|
||||||
def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager:
|
def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager:
|
||||||
return get_speaker_manager(config, restore_path, data, out_path)
|
return get_speaker_manager(config, restore_path, data, out_path)
|
||||||
|
|
||||||
def init_multispeaker(self, config: Coqpit, data: List = None):
|
def init_multispeaker(self, config: Coqpit):
|
||||||
"""Initialize a speaker embedding layer if needen and define expected embedding channel size for defining
|
"""Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
|
||||||
`in_channels` size of the connected layers.
|
vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension.
|
||||||
|
|
||||||
This implementation yields 3 possible outcomes:
|
|
||||||
|
|
||||||
1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing.
|
|
||||||
2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512.
|
|
||||||
3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
|
|
||||||
`config.d_vector_dim` or 512.
|
|
||||||
|
|
||||||
You can override this function for new models.0
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): Model configuration.
|
config (Coqpit): Model configuration.
|
||||||
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
|
|
||||||
"""
|
"""
|
||||||
# init speaker manager
|
# set number of speakers
|
||||||
self.speaker_manager = get_speaker_manager(config, data=data)
|
if self.speaker_manager is not None:
|
||||||
|
|
||||||
# set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
|
|
||||||
if data is not None or self.speaker_manager.speaker_ids:
|
|
||||||
self.num_speakers = self.speaker_manager.num_speakers
|
self.num_speakers = self.speaker_manager.num_speakers
|
||||||
else:
|
elif hasattr(config, "num_speakers"):
|
||||||
self.num_speakers = (
|
self.num_speakers = config.num_speakers
|
||||||
config.num_speakers
|
|
||||||
if "num_speakers" in config and config.num_speakers != 0
|
|
||||||
else self.speaker_manager.num_speakers
|
|
||||||
)
|
|
||||||
|
|
||||||
# set ultimate speaker embedding size
|
# set ultimate speaker embedding size
|
||||||
if config.use_speaker_embedding or config.use_d_vector_file:
|
if config.use_speaker_embedding or config.use_d_vector_file:
|
||||||
|
@ -85,13 +93,10 @@ class BaseTTS(BaseModel):
|
||||||
)
|
)
|
||||||
# init speaker embedding layer
|
# init speaker embedding layer
|
||||||
if config.use_speaker_embedding and not config.use_d_vector_file:
|
if config.use_speaker_embedding and not config.use_d_vector_file:
|
||||||
|
print(" > Init speaker_embedding layer.")
|
||||||
self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
||||||
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
||||||
|
|
||||||
def get_aux_input(self, **kwargs) -> Dict:
|
|
||||||
"""Prepare and return `aux_input` used by `forward()`"""
|
|
||||||
return {"speaker_id": None, "style_wav": None, "d_vector": None}
|
|
||||||
|
|
||||||
def format_batch(self, batch: Dict) -> Dict:
|
def format_batch(self, batch: Dict) -> Dict:
|
||||||
"""Generic batch formatting for `TTSDataset`.
|
"""Generic batch formatting for `TTSDataset`.
|
||||||
|
|
||||||
|
@ -169,7 +174,7 @@ class BaseTTS(BaseModel):
|
||||||
def get_data_loader(
|
def get_data_loader(
|
||||||
self,
|
self,
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
ap: AudioProcessor,
|
assets: Dict,
|
||||||
is_eval: bool,
|
is_eval: bool,
|
||||||
data_items: List,
|
data_items: List,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
@ -179,14 +184,12 @@ class BaseTTS(BaseModel):
|
||||||
if is_eval and not config.run_eval:
|
if is_eval and not config.run_eval:
|
||||||
loader = None
|
loader = None
|
||||||
else:
|
else:
|
||||||
|
ap = assets["audio_processor"]
|
||||||
|
|
||||||
# setup multi-speaker attributes
|
# setup multi-speaker attributes
|
||||||
if hasattr(self, "speaker_manager"):
|
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
|
||||||
speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None
|
speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None
|
||||||
d_vector_mapping = (
|
d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None
|
||||||
self.speaker_manager.d_vectors
|
|
||||||
if config.use_speaker_embedding and config.use_d_vector_file
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
speaker_id_mapping = None
|
speaker_id_mapping = None
|
||||||
d_vector_mapping = None
|
d_vector_mapping = None
|
||||||
|
@ -219,9 +222,7 @@ class BaseTTS(BaseModel):
|
||||||
use_noise_augment=not is_eval,
|
use_noise_augment=not is_eval,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
speaker_id_mapping=speaker_id_mapping,
|
speaker_id_mapping=speaker_id_mapping,
|
||||||
d_vector_mapping=d_vector_mapping
|
d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
|
||||||
if config.use_speaker_embedding and config.use_d_vector_file
|
|
||||||
else None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# pre-compute phonemes
|
# pre-compute phonemes
|
||||||
|
@ -280,19 +281,41 @@ class BaseTTS(BaseModel):
|
||||||
)
|
)
|
||||||
return loader
|
return loader
|
||||||
|
|
||||||
def test_run(self, ap) -> Tuple[Dict, Dict]:
|
def _get_test_aux_input(
|
||||||
|
self,
|
||||||
|
) -> Dict:
|
||||||
|
|
||||||
|
d_vector = None
|
||||||
|
if self.config.use_d_vector_file:
|
||||||
|
d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors]
|
||||||
|
d_vector = (random.sample(sorted(d_vector), 1),)
|
||||||
|
|
||||||
|
aux_inputs = {
|
||||||
|
"speaker_id": None
|
||||||
|
if not self.config.use_speaker_embedding
|
||||||
|
else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1),
|
||||||
|
"d_vector": d_vector,
|
||||||
|
"style_wav": None, # TODO: handle GST style input
|
||||||
|
}
|
||||||
|
return aux_inputs
|
||||||
|
|
||||||
|
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
|
||||||
"""Generic test run for `tts` models used by `Trainer`.
|
"""Generic test run for `tts` models used by `Trainer`.
|
||||||
|
|
||||||
You can override this for a different behaviour.
|
You can override this for a different behaviour.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
|
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
|
||||||
"""
|
"""
|
||||||
|
ap = assets["audio_processor"]
|
||||||
print(" | > Synthesizing test sentences.")
|
print(" | > Synthesizing test sentences.")
|
||||||
test_audios = {}
|
test_audios = {}
|
||||||
test_figures = {}
|
test_figures = {}
|
||||||
test_sentences = self.config.test_sentences
|
test_sentences = self.config.test_sentences
|
||||||
aux_inputs = self.get_aux_input()
|
aux_inputs = self._get_test_aux_input()
|
||||||
for idx, sen in enumerate(test_sentences):
|
for idx, sen in enumerate(test_sentences):
|
||||||
outputs_dict = synthesis(
|
outputs_dict = synthesis(
|
||||||
self,
|
self,
|
||||||
|
@ -315,3 +338,17 @@ class BaseTTS(BaseModel):
|
||||||
outputs_dict["outputs"]["alignments"], output_fig=False
|
outputs_dict["outputs"]["alignments"], output_fig=False
|
||||||
)
|
)
|
||||||
return test_figures, test_audios
|
return test_figures, test_audios
|
||||||
|
|
||||||
|
def on_init_start(self, trainer):
|
||||||
|
"""Save the speaker.json at the beginning of the training. And update the config.json with the
|
||||||
|
speakers.json file path."""
|
||||||
|
if self.speaker_manager is not None:
|
||||||
|
output_path = os.path.join(trainer.output_path, "speakers.json")
|
||||||
|
self.speaker_manager.save_speaker_ids_to_file(output_path)
|
||||||
|
trainer.config.speakers_file = output_path
|
||||||
|
# some models don't have `model_args` set
|
||||||
|
if hasattr(trainer.config, "model_args"):
|
||||||
|
trainer.config.model_args.speakers_file = output_path
|
||||||
|
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
|
||||||
|
print(f" > `speakers.json` is saved to {output_path}.")
|
||||||
|
print(" > `speakers_file` is updated in the config.json.")
|
||||||
|
|
|
@ -13,8 +13,8 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||||
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
|
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -32,9 +32,6 @@ class ForwardTTSArgs(Coqpit):
|
||||||
hidden_channels (int):
|
hidden_channels (int):
|
||||||
Number of base hidden channels of the model. Defaults to 512.
|
Number of base hidden channels of the model. Defaults to 512.
|
||||||
|
|
||||||
num_speakers (int):
|
|
||||||
Number of speakers for the speaker embedding layer. Defaults to 0.
|
|
||||||
|
|
||||||
use_aligner (bool):
|
use_aligner (bool):
|
||||||
Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
|
Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
|
||||||
If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
|
If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
|
||||||
|
@ -87,12 +84,6 @@ class ForwardTTSArgs(Coqpit):
|
||||||
decoder_params (str):
|
decoder_params (str):
|
||||||
Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```
|
Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```
|
||||||
|
|
||||||
use_d_vetor (bool):
|
|
||||||
Whether to use precomputed d-vectors for multi-speaker training. Defaults to False.
|
|
||||||
|
|
||||||
d_vector_dim (int):
|
|
||||||
Number of channels of the d-vectors. Defaults to 0.
|
|
||||||
|
|
||||||
detach_duration_predictor (bool):
|
detach_duration_predictor (bool):
|
||||||
Detach the input to the duration predictor from the earlier computation graph so that the duraiton loss
|
Detach the input to the duration predictor from the earlier computation graph so that the duraiton loss
|
||||||
does not pass to the earlier layers. Defaults to True.
|
does not pass to the earlier layers. Defaults to True.
|
||||||
|
@ -100,12 +91,26 @@ class ForwardTTSArgs(Coqpit):
|
||||||
max_duration (int):
|
max_duration (int):
|
||||||
Maximum duration accepted by the model. Defaults to 75.
|
Maximum duration accepted by the model. Defaults to 75.
|
||||||
|
|
||||||
|
num_speakers (int):
|
||||||
|
Number of speakers for the speaker embedding layer. Defaults to 0.
|
||||||
|
|
||||||
|
speakers_file (str):
|
||||||
|
Path to the speaker mapping file for the Speaker Manager. Defaults to None.
|
||||||
|
|
||||||
|
speaker_embedding_channels (int):
|
||||||
|
Number of speaker embedding channels. Defaults to 256.
|
||||||
|
|
||||||
|
use_d_vector_file (bool):
|
||||||
|
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
|
||||||
|
|
||||||
|
d_vector_dim (int):
|
||||||
|
Number of d-vector channels. Defaults to 0.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
num_chars: int = None
|
num_chars: int = None
|
||||||
out_channels: int = 80
|
out_channels: int = 80
|
||||||
hidden_channels: int = 384
|
hidden_channels: int = 384
|
||||||
num_speakers: int = 0
|
|
||||||
use_aligner: bool = True
|
use_aligner: bool = True
|
||||||
use_pitch: bool = True
|
use_pitch: bool = True
|
||||||
pitch_predictor_hidden_channels: int = 256
|
pitch_predictor_hidden_channels: int = 256
|
||||||
|
@ -126,10 +131,14 @@ class ForwardTTSArgs(Coqpit):
|
||||||
decoder_params: dict = field(
|
decoder_params: dict = field(
|
||||||
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}
|
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}
|
||||||
)
|
)
|
||||||
use_d_vector: bool = False
|
|
||||||
d_vector_dim: int = 0
|
|
||||||
detach_duration_predictor: bool = False
|
detach_duration_predictor: bool = False
|
||||||
max_duration: int = 75
|
max_duration: int = 75
|
||||||
|
num_speakers: int = 1
|
||||||
|
use_speaker_embedding: bool = False
|
||||||
|
speakers_file: str = None
|
||||||
|
use_d_vector_file: bool = False
|
||||||
|
d_vector_dim: int = None
|
||||||
|
d_vector_file: str = None
|
||||||
|
|
||||||
|
|
||||||
class ForwardTTS(BaseTTS):
|
class ForwardTTS(BaseTTS):
|
||||||
|
@ -151,6 +160,8 @@ class ForwardTTS(BaseTTS):
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): Model coqpit class.
|
config (Coqpit): Model coqpit class.
|
||||||
|
speaker_manager (SpeakerManager): Speaker manager for multi-speaker training. Only used for multi-speaker models.
|
||||||
|
Defaults to None.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
|
>>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
|
||||||
|
@ -159,26 +170,12 @@ class ForwardTTS(BaseTTS):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=dangerous-default-value
|
# pylint: disable=dangerous-default-value
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
|
||||||
|
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
|
||||||
# don't use isintance not to import recursively
|
self.speaker_manager = speaker_manager
|
||||||
if "Config" in config.__class__.__name__:
|
self.init_multispeaker(config)
|
||||||
if "characters" in config:
|
|
||||||
# loading from FasrPitchConfig
|
|
||||||
_, self.config, num_chars = self.get_characters(config)
|
|
||||||
config.model_args.num_chars = num_chars
|
|
||||||
self.args = self.config.model_args
|
|
||||||
else:
|
|
||||||
# loading from ForwardTTSArgs
|
|
||||||
self.config = config
|
|
||||||
self.args = config.model_args
|
|
||||||
elif isinstance(config, ForwardTTSArgs):
|
|
||||||
self.args = config
|
|
||||||
self.config = config
|
|
||||||
else:
|
|
||||||
raise ValueError("config must be either a *Config or ForwardTTSArgs")
|
|
||||||
|
|
||||||
self.max_duration = self.args.max_duration
|
self.max_duration = self.args.max_duration
|
||||||
self.use_aligner = self.args.use_aligner
|
self.use_aligner = self.args.use_aligner
|
||||||
|
@ -196,7 +193,7 @@ class ForwardTTS(BaseTTS):
|
||||||
self.args.hidden_channels,
|
self.args.hidden_channels,
|
||||||
self.args.encoder_type,
|
self.args.encoder_type,
|
||||||
self.args.encoder_params,
|
self.args.encoder_params,
|
||||||
self.args.d_vector_dim,
|
self.embedded_speaker_dim,
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.args.positional_encoding:
|
if self.args.positional_encoding:
|
||||||
|
@ -210,7 +207,7 @@ class ForwardTTS(BaseTTS):
|
||||||
)
|
)
|
||||||
|
|
||||||
self.duration_predictor = DurationPredictor(
|
self.duration_predictor = DurationPredictor(
|
||||||
self.args.hidden_channels + self.args.d_vector_dim,
|
self.args.hidden_channels + self.embedded_speaker_dim,
|
||||||
self.args.duration_predictor_hidden_channels,
|
self.args.duration_predictor_hidden_channels,
|
||||||
self.args.duration_predictor_kernel_size,
|
self.args.duration_predictor_kernel_size,
|
||||||
self.args.duration_predictor_dropout_p,
|
self.args.duration_predictor_dropout_p,
|
||||||
|
@ -218,7 +215,7 @@ class ForwardTTS(BaseTTS):
|
||||||
|
|
||||||
if self.args.use_pitch:
|
if self.args.use_pitch:
|
||||||
self.pitch_predictor = DurationPredictor(
|
self.pitch_predictor = DurationPredictor(
|
||||||
self.args.hidden_channels + self.args.d_vector_dim,
|
self.args.hidden_channels + self.embedded_speaker_dim,
|
||||||
self.args.pitch_predictor_hidden_channels,
|
self.args.pitch_predictor_hidden_channels,
|
||||||
self.args.pitch_predictor_kernel_size,
|
self.args.pitch_predictor_kernel_size,
|
||||||
self.args.pitch_predictor_dropout_p,
|
self.args.pitch_predictor_dropout_p,
|
||||||
|
@ -230,19 +227,37 @@ class ForwardTTS(BaseTTS):
|
||||||
padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
|
padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.args.num_speakers > 1 and not self.args.use_d_vector:
|
|
||||||
# speaker embedding layer
|
|
||||||
self.emb_g = nn.Embedding(self.args.num_speakers, self.args.d_vector_dim)
|
|
||||||
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
|
||||||
|
|
||||||
if self.args.d_vector_dim > 0 and self.args.d_vector_dim != self.args.hidden_channels:
|
|
||||||
self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1)
|
|
||||||
|
|
||||||
if self.args.use_aligner:
|
if self.args.use_aligner:
|
||||||
self.aligner = AlignmentNetwork(
|
self.aligner = AlignmentNetwork(
|
||||||
in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
|
in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def init_multispeaker(self, config: "Coqpit"):
    """Set up the speaker-conditioning layers for multi-speaker training.

    ``self.embedded_speaker_dim`` is always reset to 0 first. When a speaker manager is
    attached, ``self.num_speakers`` is read from it. Depending on the config flags:

    - ``use_d_vector_file``: ``embedded_speaker_dim`` becomes ``config.d_vector_dim`` and a 1x1
      convolution ``proj_g`` is created when the d-vector size differs from ``hidden_channels``.
    - ``use_speaker_embedding`` (and no d-vector file): a learnable table ``emb_g`` with
      ``hidden_channels`` columns and one row per speaker known to the speaker manager.

    Args:
        config (Coqpit): Model configuration.

    Raises:
        ValueError: If a multi-speaker mode is enabled but no speaker manager was provided.
    """
    self.embedded_speaker_dim = 0
    # a multi-speaker model cannot be built without knowing the speakers
    if self.speaker_manager is None and (config.use_d_vector_file or config.use_speaker_embedding):
        raise ValueError(
            " > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model."
        )
    # set number of speakers
    if self.speaker_manager is not None:
        self.num_speakers = self.speaker_manager.num_speakers
    # init d-vector embedding
    if config.use_d_vector_file:
        self.embedded_speaker_dim = config.d_vector_dim
        if self.args.d_vector_dim != self.args.hidden_channels:
            self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1)
    # init speaker embedding layer
    if config.use_speaker_embedding and not config.use_d_vector_file:
        print(" > Init speaker_embedding layer.")
        # Size the table from the speaker manager, not from `args.num_speakers`: the latter
        # defaults to 1 and would make every speaker id above 0 an out-of-range lookup.
        # The speaker manager is guaranteed to exist here by the check above.
        self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels)
        nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def generate_attn(dr, x_mask, y_mask=None):
|
def generate_attn(dr, x_mask, y_mask=None):
|
||||||
"""Generate an attention mask from the durations.
|
"""Generate an attention mask from the durations.
|
||||||
|
@ -307,18 +322,6 @@ class ForwardTTS(BaseTTS):
|
||||||
o_dr = torch.round(o_dr)
|
o_dr = torch.round(o_dr)
|
||||||
return o_dr
|
return o_dr
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _concat_speaker_embedding(o_en, g):
|
|
||||||
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
|
|
||||||
o_en = torch.cat([o_en, g_exp], 1)
|
|
||||||
return o_en
|
|
||||||
|
|
||||||
def _sum_speaker_embedding(self, x, g):
|
|
||||||
# project g to decoder dim.
|
|
||||||
if hasattr(self, "proj_g"):
|
|
||||||
g = self.proj_g(g)
|
|
||||||
return x + g
|
|
||||||
|
|
||||||
def _forward_encoder(
|
def _forward_encoder(
|
||||||
self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None
|
self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None
|
||||||
) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
|
) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
|
||||||
|
@ -327,7 +330,7 @@ class ForwardTTS(BaseTTS):
|
||||||
1. Embed speaker IDs if multi-speaker mode.
|
1. Embed speaker IDs if multi-speaker mode.
|
||||||
2. Embed character sequences.
|
2. Embed character sequences.
|
||||||
3. Run the encoder network.
|
3. Run the encoder network.
|
||||||
4. Concat speaker embedding to the encoder output for the duration predictor.
|
4. Sum encoder outputs and speaker embeddings
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
x (torch.LongTensor): Input sequence IDs.
|
x (torch.LongTensor): Input sequence IDs.
|
||||||
|
@ -345,19 +348,18 @@ class ForwardTTS(BaseTTS):
|
||||||
- g: :math:`(B, C)`
|
- g: :math:`(B, C)`
|
||||||
"""
|
"""
|
||||||
if hasattr(self, "emb_g"):
|
if hasattr(self, "emb_g"):
|
||||||
g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1]
|
g = self.emb_g(g) # [B, C, 1]
|
||||||
if g is not None:
|
if g is not None:
|
||||||
g = g.unsqueeze(-1)
|
g = g.unsqueeze(-1)
|
||||||
# [B, T, C]
|
# [B, T, C]
|
||||||
x_emb = self.emb(x)
|
x_emb = self.emb(x)
|
||||||
# encoder pass
|
# encoder pass
|
||||||
o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask)
|
o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask)
|
||||||
# speaker conditioning for duration predictor
|
# speaker conditioning
|
||||||
|
# TODO: try different ways of conditioning
|
||||||
if g is not None:
|
if g is not None:
|
||||||
o_en_dp = self._concat_speaker_embedding(o_en, g)
|
o_en = o_en + g
|
||||||
else:
|
return o_en, x_mask, g, x_emb
|
||||||
o_en_dp = o_en
|
|
||||||
return o_en, o_en_dp, x_mask, g, x_emb
|
|
||||||
|
|
||||||
def _forward_decoder(
|
def _forward_decoder(
|
||||||
self,
|
self,
|
||||||
|
@ -391,9 +393,6 @@ class ForwardTTS(BaseTTS):
|
||||||
# positional encoding
|
# positional encoding
|
||||||
if hasattr(self, "pos_encoder"):
|
if hasattr(self, "pos_encoder"):
|
||||||
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
|
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
|
||||||
# speaker embedding
|
|
||||||
if g is not None:
|
|
||||||
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
|
|
||||||
# decoder pass
|
# decoder pass
|
||||||
o_de = self.decoder(o_en_ex, y_mask, g=g)
|
o_de = self.decoder(o_en_ex, y_mask, g=g)
|
||||||
return o_de.transpose(1, 2), attn.transpose(1, 2)
|
return o_de.transpose(1, 2), attn.transpose(1, 2)
|
||||||
|
@ -475,6 +474,19 @@ class ForwardTTS(BaseTTS):
|
||||||
alignment_soft = alignment_soft.squeeze(1).transpose(1, 2)
|
alignment_soft = alignment_soft.squeeze(1).transpose(1, 2)
|
||||||
return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas
|
return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas
|
||||||
|
|
||||||
|
def _set_speaker_input(self, aux_input: Dict):
|
||||||
|
d_vectors = aux_input.get("d_vectors", None)
|
||||||
|
speaker_ids = aux_input.get("speaker_ids", None)
|
||||||
|
|
||||||
|
if d_vectors is not None and speaker_ids is not None:
|
||||||
|
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
|
||||||
|
|
||||||
|
if speaker_ids is not None and not hasattr(self, "emb_g"):
|
||||||
|
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
|
||||||
|
|
||||||
|
g = speaker_ids if speaker_ids is not None else d_vectors
|
||||||
|
return g
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
x: torch.LongTensor,
|
x: torch.LongTensor,
|
||||||
|
@ -505,17 +517,17 @@ class ForwardTTS(BaseTTS):
|
||||||
- g: :math:`[B, C]`
|
- g: :math:`[B, C]`
|
||||||
- pitch: :math:`[B, 1, T]`
|
- pitch: :math:`[B, 1, T]`
|
||||||
"""
|
"""
|
||||||
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
|
g = self._set_speaker_input(aux_input)
|
||||||
# compute sequence masks
|
# compute sequence masks
|
||||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
|
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
|
||||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
|
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
|
||||||
# encoder pass
|
# encoder pass
|
||||||
o_en, o_en_dp, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
|
o_en, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
|
||||||
# duration predictor pass
|
# duration predictor pass
|
||||||
if self.args.detach_duration_predictor:
|
if self.args.detach_duration_predictor:
|
||||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
o_dr_log = self.duration_predictor(o_en.detach(), x_mask)
|
||||||
else:
|
else:
|
||||||
o_dr_log = self.duration_predictor(o_en_dp, x_mask)
|
o_dr_log = self.duration_predictor(o_en, x_mask)
|
||||||
o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
|
o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
|
||||||
# generate attn mask from predicted durations
|
# generate attn mask from predicted durations
|
||||||
o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
|
o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
|
||||||
|
@ -535,10 +547,12 @@ class ForwardTTS(BaseTTS):
|
||||||
o_pitch = None
|
o_pitch = None
|
||||||
avg_pitch = None
|
avg_pitch = None
|
||||||
if self.args.use_pitch:
|
if self.args.use_pitch:
|
||||||
o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en_dp, x_mask, pitch, dr)
|
o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en, x_mask, pitch, dr)
|
||||||
o_en = o_en + o_pitch_emb
|
o_en = o_en + o_pitch_emb
|
||||||
# decoder pass
|
# decoder pass
|
||||||
o_de, attn = self._forward_decoder(o_en, dr, x_mask, y_lengths, g=g)
|
o_de, attn = self._forward_decoder(
|
||||||
|
o_en, dr, x_mask, y_lengths, g=None
|
||||||
|
) # TODO: maybe pass speaker embedding (g) too
|
||||||
outputs = {
|
outputs = {
|
||||||
"model_outputs": o_de, # [B, T, C]
|
"model_outputs": o_de, # [B, T, C]
|
||||||
"durations_log": o_dr_log.squeeze(1), # [B, T]
|
"durations_log": o_dr_log.squeeze(1), # [B, T]
|
||||||
|
@ -569,22 +583,22 @@ class ForwardTTS(BaseTTS):
|
||||||
- x_lengths: [B]
|
- x_lengths: [B]
|
||||||
- g: [B, C]
|
- g: [B, C]
|
||||||
"""
|
"""
|
||||||
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
|
g = self._set_speaker_input(aux_input)
|
||||||
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
|
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
|
||||||
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype).float()
|
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype).float()
|
||||||
# encoder pass
|
# encoder pass
|
||||||
o_en, o_en_dp, x_mask, g, _ = self._forward_encoder(x, x_mask, g)
|
o_en, x_mask, g, _ = self._forward_encoder(x, x_mask, g)
|
||||||
# duration predictor pass
|
# duration predictor pass
|
||||||
o_dr_log = self.duration_predictor(o_en_dp, x_mask)
|
o_dr_log = self.duration_predictor(o_en, x_mask)
|
||||||
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
|
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
|
||||||
y_lengths = o_dr.sum(1)
|
y_lengths = o_dr.sum(1)
|
||||||
# pitch predictor pass
|
# pitch predictor pass
|
||||||
o_pitch = None
|
o_pitch = None
|
||||||
if self.args.use_pitch:
|
if self.args.use_pitch:
|
||||||
o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en_dp, x_mask)
|
o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en, x_mask)
|
||||||
o_en = o_en + o_pitch_emb
|
o_en = o_en + o_pitch_emb
|
||||||
# decoder pass
|
# decoder pass
|
||||||
o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=g)
|
o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=None)
|
||||||
outputs = {
|
outputs = {
|
||||||
"model_outputs": o_de,
|
"model_outputs": o_de,
|
||||||
"alignments": attn,
|
"alignments": attn,
|
||||||
|
@ -634,7 +648,8 @@ class ForwardTTS(BaseTTS):
|
||||||
|
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use
|
def _create_logs(self, batch, outputs, ap):
|
||||||
|
"""Create common logger outputs."""
|
||||||
model_outputs = outputs["model_outputs"]
|
model_outputs = outputs["model_outputs"]
|
||||||
alignments = outputs["alignments"]
|
alignments = outputs["alignments"]
|
||||||
mel_input = batch["mel_input"]
|
mel_input = batch["mel_input"]
|
||||||
|
@ -674,11 +689,22 @@ class ForwardTTS(BaseTTS):
|
||||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||||
return figures, {"audio": train_audio}
|
return figures, {"audio": train_audio}
|
||||||
|
|
||||||
|
def train_log(
    self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None:  # pylint: disable=no-self-use
    """Build the training figures and audio samples and hand them to ``logger``.

    Args:
        batch (dict): Batch the step was run on; forwarded to ``_create_logs``.
        outputs (dict): Model outputs of the step; forwarded to ``_create_logs``.
        logger (Logger): Receives the results via ``train_figures`` / ``train_audios``.
        assets (dict): Must hold the audio processor under ``"audio_processor"``.
        steps (int): Step value passed along with each logger call.
    """
    audio_processor = assets["audio_processor"]
    plots, waveforms = self._create_logs(batch, outputs, audio_processor)
    logger.train_figures(steps, plots)
    logger.train_audios(steps, waveforms, audio_processor.sample_rate)
|
||||||
|
|
||||||
def eval_step(self, batch: dict, criterion: nn.Module):
|
def eval_step(self, batch: dict, criterion: nn.Module):
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
    """Build the evaluation figures and audio samples and hand them to ``logger``.

    Args:
        batch (dict): Batch the step was run on; forwarded to ``_create_logs``.
        outputs (dict): Model outputs of the step; forwarded to ``_create_logs``.
        logger (Logger): Receives the results via ``eval_figures`` / ``eval_audios``.
        assets (dict): Must hold the audio processor under ``"audio_processor"``.
        steps (int): Step value passed along with each logger call.
    """
    audio_processor = assets["audio_processor"]
    plots, waveforms = self._create_logs(batch, outputs, audio_processor)
    logger.eval_figures(steps, plots)
    logger.eval_audios(steps, waveforms, audio_processor.sample_rate)
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False
|
||||||
|
|
|
@ -1,19 +1,20 @@
|
||||||
import math
|
import math
|
||||||
|
from typing import Dict, Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.cuda.amp.autocast_mode import autocast
|
from torch.cuda.amp.autocast_mode import autocast
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from TTS.tts.configs import GlowTTSConfig
|
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
from TTS.tts.layers.glow_tts.decoder import Decoder
|
from TTS.tts.layers.glow_tts.decoder import Decoder
|
||||||
from TTS.tts.layers.glow_tts.encoder import Encoder
|
from TTS.tts.layers.glow_tts.encoder import Encoder
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||||
from TTS.tts.utils.speakers import get_speaker_manager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.synthesis import synthesis
|
from TTS.tts.utils.synthesis import synthesis
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,16 +39,18 @@ class GlowTTS(BaseTTS):
|
||||||
Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.
|
Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> from TTS.tts.configs import GlowTTSConfig
|
>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
>>> from TTS.tts.models.glow_tts import GlowTTS
|
>>> from TTS.tts.models.glow_tts import GlowTTS
|
||||||
>>> config = GlowTTSConfig()
|
>>> config = GlowTTSConfig()
|
||||||
>>> model = GlowTTS(config)
|
>>> model = GlowTTS(config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: GlowTTSConfig):
|
def __init__(self, config: GlowTTSConfig, speaker_manager: SpeakerManager = None):
|
||||||
|
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
|
||||||
|
self.speaker_manager = speaker_manager
|
||||||
|
|
||||||
# pass all config fields to `self`
|
# pass all config fields to `self`
|
||||||
# for fewer code change
|
# for fewer code change
|
||||||
|
@ -58,19 +61,10 @@ class GlowTTS(BaseTTS):
|
||||||
_, self.config, self.num_chars = self.get_characters(config)
|
_, self.config, self.num_chars = self.get_characters(config)
|
||||||
self.decoder_output_dim = config.out_channels
|
self.decoder_output_dim = config.out_channels
|
||||||
|
|
||||||
|
# init multi-speaker layers if necessary
|
||||||
self.init_multispeaker(config)
|
self.init_multispeaker(config)
|
||||||
|
|
||||||
# if is a multispeaker and c_in_channels is 0, set to 256
|
|
||||||
self.c_in_channels = 0
|
|
||||||
if self.num_speakers > 1:
|
|
||||||
if self.d_vector_dim:
|
|
||||||
self.c_in_channels = self.d_vector_dim
|
|
||||||
elif self.c_in_channels == 0 and not self.d_vector_dim:
|
|
||||||
# TODO: make this adjustable
|
|
||||||
self.c_in_channels = 256
|
|
||||||
|
|
||||||
self.run_data_dep_init = config.data_dep_init_steps > 0
|
self.run_data_dep_init = config.data_dep_init_steps > 0
|
||||||
|
|
||||||
self.encoder = Encoder(
|
self.encoder = Encoder(
|
||||||
self.num_chars,
|
self.num_chars,
|
||||||
out_channels=self.out_channels,
|
out_channels=self.out_channels,
|
||||||
|
@ -98,28 +92,35 @@ class GlowTTS(BaseTTS):
|
||||||
c_in_channels=self.c_in_channels,
|
c_in_channels=self.c_in_channels,
|
||||||
)
|
)
|
||||||
|
|
||||||
def init_multispeaker(self, config: "Coqpit", data: list = None) -> None:
|
def init_multispeaker(self, config: "Coqpit"):
    """Configure speaker conditioning and, if requested, create the speaker embedding table.

    Sets ``self.embedded_speaker_dim`` (0 for single-speaker models), creates ``self.emb_g``
    when ``use_speaker_embedding`` is on and d-vectors are off, and finally mirrors the
    resulting dimension into ``self.c_in_channels``. With d-vectors only the expected
    dimension is recorded; no layer is created.

    Args:
        config (Coqpit): Model configuration.

    Raises:
        ValueError: If a multi-speaker mode is enabled but no speaker manager was provided.
    """
    self.embedded_speaker_dim = 0

    if self.speaker_manager is None:
        # multi-speaker modes need a speaker manager
        if self.use_speaker_embedding or self.use_d_vector_file:
            raise ValueError(
                " > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model."
            )
    else:
        # the speaker count always comes from the speaker manager
        self.num_speakers = self.speaker_manager.num_speakers

    # expected speaker vector size: the configured d-vector size, or 512 when it is not set
    if config.use_speaker_embedding or config.use_d_vector_file:
        dim_is_configured = "d_vector_dim" in config and config.d_vector_dim is not None
        self.embedded_speaker_dim = config.d_vector_dim if dim_is_configured else 512

    # learnable speaker table; its width overrides the size chosen above
    if config.use_speaker_embedding and not config.use_d_vector_file:
        print(" > Init speaker_embedding layer.")
        self.embedded_speaker_dim = self.hidden_channels_enc
        self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc)
        nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

    # conditioning channels follow the final speaker vector size
    self.c_in_channels = self.embedded_speaker_dim
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def compute_outputs(attn, o_mean, o_log_scale, x_mask):
|
def compute_outputs(attn, o_mean, o_log_scale, x_mask):
|
||||||
|
@ -146,6 +147,35 @@ class GlowTTS(BaseTTS):
|
||||||
if getattr(f, "set_ddi", False):
|
if getattr(f, "set_ddi", False):
|
||||||
f.set_ddi(False)
|
f.set_ddi(False)
|
||||||
|
|
||||||
|
def _set_speaker_input(self, aux_input: Dict):
|
||||||
|
if aux_input is None:
|
||||||
|
d_vectors = None
|
||||||
|
speaker_ids = None
|
||||||
|
else:
|
||||||
|
d_vectors = aux_input.get("d_vectors", None)
|
||||||
|
speaker_ids = aux_input.get("speaker_ids", None)
|
||||||
|
|
||||||
|
if d_vectors is not None and speaker_ids is not None:
|
||||||
|
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
|
||||||
|
|
||||||
|
if speaker_ids is not None and not hasattr(self, "emb_g"):
|
||||||
|
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
|
||||||
|
|
||||||
|
g = speaker_ids if speaker_ids is not None else d_vectors
|
||||||
|
return g
|
||||||
|
|
||||||
|
def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]:
|
||||||
|
g = self._set_speaker_input(aux_input)
|
||||||
|
# speaker embedding
|
||||||
|
if g is not None:
|
||||||
|
if hasattr(self, "emb_g"):
|
||||||
|
# use speaker embedding layer
|
||||||
|
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
|
||||||
|
else:
|
||||||
|
# use d-vector
|
||||||
|
g = F.normalize(g).unsqueeze(-1) # [b, h, 1]
|
||||||
|
return g
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
|
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
|
||||||
): # pylint: disable=dangerous-default-value
|
): # pylint: disable=dangerous-default-value
|
||||||
|
@ -161,12 +191,7 @@ class GlowTTS(BaseTTS):
|
||||||
y = y.transpose(1, 2)
|
y = y.transpose(1, 2)
|
||||||
y_max_length = y.size(2)
|
y_max_length = y.size(2)
|
||||||
# norm speaker embeddings
|
# norm speaker embeddings
|
||||||
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
g = self._speaker_embedding(aux_input)
|
||||||
if self.use_speaker_embedding or self.use_d_vector_file:
|
|
||||||
if not self.use_d_vector_file:
|
|
||||||
g = F.normalize(g).unsqueeze(-1)
|
|
||||||
else:
|
|
||||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
|
|
||||||
# embedding pass
|
# embedding pass
|
||||||
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
||||||
# drop redisual frames wrt num_squeeze and set y_lengths.
|
# drop redisual frames wrt num_squeeze and set y_lengths.
|
||||||
|
@ -217,12 +242,7 @@ class GlowTTS(BaseTTS):
|
||||||
y = y.transpose(1, 2)
|
y = y.transpose(1, 2)
|
||||||
y_max_length = y.size(2)
|
y_max_length = y.size(2)
|
||||||
# norm speaker embeddings
|
# norm speaker embeddings
|
||||||
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
g = self._speaker_embedding(aux_input)
|
||||||
if self.use_speaker_embedding or self.use_d_vector_file:
|
|
||||||
if not self.use_d_vector_file:
|
|
||||||
g = F.normalize(g).unsqueeze(-1)
|
|
||||||
else:
|
|
||||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
|
|
||||||
# embedding pass
|
# embedding pass
|
||||||
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
||||||
# drop redisual frames wrt num_squeeze and set y_lengths.
|
# drop redisual frames wrt num_squeeze and set y_lengths.
|
||||||
|
@ -272,22 +292,12 @@ class GlowTTS(BaseTTS):
|
||||||
"""
|
"""
|
||||||
y = y.transpose(1, 2)
|
y = y.transpose(1, 2)
|
||||||
y_max_length = y.size(2)
|
y_max_length = y.size(2)
|
||||||
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
g = self._speaker_embedding(aux_input)
|
||||||
# norm speaker embeddings
|
|
||||||
if g is not None:
|
|
||||||
if self.external_d_vector_dim:
|
|
||||||
g = F.normalize(g).unsqueeze(-1)
|
|
||||||
else:
|
|
||||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
|
|
||||||
|
|
||||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
|
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
|
||||||
|
|
||||||
# decoder pass
|
# decoder pass
|
||||||
z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
|
z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
|
||||||
|
|
||||||
# reverse decoder and predict
|
# reverse decoder and predict
|
||||||
y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
|
y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
|
||||||
|
|
||||||
outputs = {}
|
outputs = {}
|
||||||
outputs["model_outputs"] = y.transpose(1, 2)
|
outputs["model_outputs"] = y.transpose(1, 2)
|
||||||
outputs["logdet"] = logdet
|
outputs["logdet"] = logdet
|
||||||
|
@ -298,19 +308,12 @@ class GlowTTS(BaseTTS):
|
||||||
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
|
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
|
||||||
): # pylint: disable=dangerous-default-value
|
): # pylint: disable=dangerous-default-value
|
||||||
x_lengths = aux_input["x_lengths"]
|
x_lengths = aux_input["x_lengths"]
|
||||||
g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
|
g = self._speaker_embedding(aux_input)
|
||||||
|
|
||||||
if g is not None:
|
|
||||||
if self.d_vector_dim:
|
|
||||||
g = F.normalize(g).unsqueeze(-1)
|
|
||||||
else:
|
|
||||||
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h]
|
|
||||||
|
|
||||||
# embedding pass
|
# embedding pass
|
||||||
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
|
||||||
# compute output durations
|
# compute output durations
|
||||||
w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
|
w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
|
||||||
w_ceil = torch.ceil(w)
|
w_ceil = torch.clamp_min(torch.ceil(w), 1)
|
||||||
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
||||||
y_max_length = None
|
y_max_length = None
|
||||||
# compute masks
|
# compute masks
|
||||||
|
@ -387,17 +390,17 @@ class GlowTTS(BaseTTS):
|
||||||
)
|
)
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use
|
def _create_logs(self, batch, outputs, ap):
|
||||||
alignments = outputs["alignments"]
|
alignments = outputs["alignments"]
|
||||||
text_input = batch["text_input"]
|
text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
|
||||||
text_lengths = batch["text_lengths"]
|
text_lengths = batch["text_lengths"]
|
||||||
mel_input = batch["mel_input"]
|
mel_input = batch["mel_input"]
|
||||||
d_vectors = batch["d_vectors"]
|
d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
|
||||||
speaker_ids = batch["speaker_ids"]
|
speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None
|
||||||
|
|
||||||
# model runs reverse flow to predict spectrograms
|
# model runs reverse flow to predict spectrograms
|
||||||
pred_outputs = self.inference(
|
pred_outputs = self.inference(
|
||||||
text_input[:1],
|
text_input,
|
||||||
aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
|
aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
|
||||||
)
|
)
|
||||||
model_outputs = pred_outputs["model_outputs"]
|
model_outputs = pred_outputs["model_outputs"]
|
||||||
|
@ -416,15 +419,26 @@ class GlowTTS(BaseTTS):
|
||||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
||||||
return figures, {"audio": train_audio}
|
return figures, {"audio": train_audio}
|
||||||
|
|
||||||
|
def train_log(
|
||||||
|
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
|
||||||
|
) -> None: # pylint: disable=no-self-use
|
||||||
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.train_figures(steps, figures)
|
||||||
|
logger.train_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def eval_step(self, batch: dict, criterion: nn.Module):
|
def eval_step(self, batch: dict, criterion: nn.Module):
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
|
||||||
return self.train_log(ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.eval_figures(steps, figures)
|
||||||
|
logger.eval_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def test_run(self, ap):
|
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
|
||||||
"""Generic test run for `tts` models used by `Trainer`.
|
"""Generic test run for `tts` models used by `Trainer`.
|
||||||
|
|
||||||
You can override this for a different behaviour.
|
You can override this for a different behaviour.
|
||||||
|
@ -432,11 +446,12 @@ class GlowTTS(BaseTTS):
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
|
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
|
||||||
"""
|
"""
|
||||||
|
ap = assets["audio_processor"]
|
||||||
print(" | > Synthesizing test sentences.")
|
print(" | > Synthesizing test sentences.")
|
||||||
test_audios = {}
|
test_audios = {}
|
||||||
test_figures = {}
|
test_figures = {}
|
||||||
test_sentences = self.config.test_sentences
|
test_sentences = self.config.test_sentences
|
||||||
aux_inputs = self.get_aux_input()
|
aux_inputs = self._get_test_aux_input()
|
||||||
if len(test_sentences) == 0:
|
if len(test_sentences) == 0:
|
||||||
print(" | [!] No test sentences provided.")
|
print(" | [!] No test sentences provided.")
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,29 +1,34 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
from typing import Dict, Tuple
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
from torch.cuda.amp.autocast_mode import autocast
|
||||||
|
|
||||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||||
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
|
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
|
||||||
from TTS.tts.models.base_tacotron import BaseTacotron
|
from TTS.tts.models.base_tacotron import BaseTacotron
|
||||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
from TTS.tts.utils.measures import alignment_diagonal_score
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
|
|
||||||
|
|
||||||
class Tacotron(BaseTacotron):
|
class Tacotron(BaseTacotron):
|
||||||
"""Tacotron as in https://arxiv.org/abs/1703.10135
|
"""Tacotron as in https://arxiv.org/abs/1703.10135
|
||||||
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
||||||
Check `TacotronConfig` for the arguments.
|
Check `TacotronConfig` for the arguments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (TacotronConfig): Configuration for the Tacotron model.
|
||||||
|
speaker_manager (SpeakerManager): Speaker manager to handle multi-speaker settings. Only use if the model is
|
||||||
|
a multi-speaker model. Defaults to None.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
chars, self.config = self.get_characters(config)
|
self.speaker_manager = speaker_manager
|
||||||
|
chars, self.config, _ = self.get_characters(config)
|
||||||
config.num_chars = self.num_chars = len(chars)
|
config.num_chars = self.num_chars = len(chars)
|
||||||
|
|
||||||
# pass all config fields to `self`
|
# pass all config fields to `self`
|
||||||
|
@ -243,40 +248,47 @@ class Tacotron(BaseTacotron):
|
||||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||||
|
|
||||||
# compute loss
|
# compute loss
|
||||||
loss_dict = criterion(
|
with autocast(enabled=False): # use float32 for the criterion
|
||||||
outputs["model_outputs"],
|
loss_dict = criterion(
|
||||||
outputs["decoder_outputs"],
|
outputs["model_outputs"].float(),
|
||||||
mel_input,
|
outputs["decoder_outputs"].float(),
|
||||||
linear_input,
|
mel_input.float(),
|
||||||
outputs["stop_tokens"],
|
linear_input.float(),
|
||||||
stop_targets,
|
outputs["stop_tokens"].float(),
|
||||||
stop_target_lengths,
|
stop_targets.float(),
|
||||||
mel_lengths,
|
stop_target_lengths,
|
||||||
outputs["decoder_outputs_backward"],
|
mel_lengths,
|
||||||
outputs["alignments"],
|
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
|
||||||
alignment_lengths,
|
outputs["alignments"].float(),
|
||||||
outputs["alignments_backward"],
|
alignment_lengths,
|
||||||
text_lengths,
|
None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
|
||||||
)
|
text_lengths,
|
||||||
|
)
|
||||||
|
|
||||||
# compute alignment error (the lower the better )
|
# compute alignment error (the lower the better )
|
||||||
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
|
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
|
||||||
loss_dict["align_error"] = align_error
|
loss_dict["align_error"] = align_error
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]:
|
def _create_logs(self, batch, outputs, ap):
|
||||||
postnet_outputs = outputs["model_outputs"]
|
postnet_outputs = outputs["model_outputs"]
|
||||||
|
decoder_outputs = outputs["decoder_outputs"]
|
||||||
alignments = outputs["alignments"]
|
alignments = outputs["alignments"]
|
||||||
alignments_backward = outputs["alignments_backward"]
|
alignments_backward = outputs["alignments_backward"]
|
||||||
mel_input = batch["mel_input"]
|
mel_input = batch["mel_input"]
|
||||||
|
linear_input = batch["linear_input"]
|
||||||
|
|
||||||
pred_spec = postnet_outputs[0].data.cpu().numpy()
|
pred_linear_spec = postnet_outputs[0].data.cpu().numpy()
|
||||||
gt_spec = mel_input[0].data.cpu().numpy()
|
pred_mel_spec = decoder_outputs[0].data.cpu().numpy()
|
||||||
|
gt_linear_spec = linear_input[0].data.cpu().numpy()
|
||||||
|
gt_mel_spec = mel_input[0].data.cpu().numpy()
|
||||||
align_img = alignments[0].data.cpu().numpy()
|
align_img = alignments[0].data.cpu().numpy()
|
||||||
|
|
||||||
figures = {
|
figures = {
|
||||||
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
|
"pred_linear_spec": plot_spectrogram(pred_linear_spec, ap, output_fig=False),
|
||||||
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
|
"real_linear_spec": plot_spectrogram(gt_linear_spec, ap, output_fig=False),
|
||||||
|
"pred_mel_spec": plot_spectrogram(pred_mel_spec, ap, output_fig=False),
|
||||||
|
"real_mel_spec": plot_spectrogram(gt_mel_spec, ap, output_fig=False),
|
||||||
"alignment": plot_alignment(align_img, output_fig=False),
|
"alignment": plot_alignment(align_img, output_fig=False),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -284,11 +296,22 @@ class Tacotron(BaseTacotron):
|
||||||
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
|
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
|
||||||
|
|
||||||
# Sample audio
|
# Sample audio
|
||||||
train_audio = ap.inv_spectrogram(pred_spec.T)
|
audio = ap.inv_spectrogram(pred_linear_spec.T)
|
||||||
return figures, {"audio": train_audio}
|
return figures, {"audio": audio}
|
||||||
|
|
||||||
def eval_step(self, batch, criterion):
|
def train_log(
|
||||||
|
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
|
||||||
|
) -> None: # pylint: disable=no-self-use
|
||||||
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.train_figures(steps, figures)
|
||||||
|
logger.train_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
|
def eval_step(self, batch: dict, criterion: nn.Module):
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log(self, ap, batch, outputs):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
|
||||||
return self.train_log(ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.eval_figures(steps, figures)
|
||||||
|
logger.eval_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
|
@ -1,28 +1,50 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
from typing import Dict, Tuple
|
from typing import Dict
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
from torch.cuda.amp.autocast_mode import autocast
|
||||||
|
|
||||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||||
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
|
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
|
||||||
from TTS.tts.models.base_tacotron import BaseTacotron
|
from TTS.tts.models.base_tacotron import BaseTacotron
|
||||||
from TTS.tts.utils.measures import alignment_diagonal_score
|
from TTS.tts.utils.measures import alignment_diagonal_score
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
|
|
||||||
|
|
||||||
class Tacotron2(BaseTacotron):
|
class Tacotron2(BaseTacotron):
|
||||||
"""Tacotron2 as in https://arxiv.org/abs/1712.05884
|
"""Tacotron2 model implementation inherited from :class:`TTS.tts.models.base_tacotron.BaseTacotron`.
|
||||||
Check `TacotronConfig` for the arguments.
|
|
||||||
|
Paper::
|
||||||
|
https://arxiv.org/abs/1712.05884
|
||||||
|
|
||||||
|
Paper abstract::
|
||||||
|
This paper describes Tacotron 2, a neural network architecture for speech synthesis directly from text.
|
||||||
|
The system is composed of a recurrent sequence-to-sequence feature prediction network that maps character
|
||||||
|
embeddings to mel-scale spectrograms, followed by a modified WaveNet model acting as a vocoder to synthesize
|
||||||
|
timedomain waveforms from those spectrograms. Our model achieves a mean opinion score (MOS) of 4.53 comparable
|
||||||
|
to a MOS of 4.58 for professionally recorded speech. To validate our design choices, we present ablation
|
||||||
|
studies of key components of our system and evaluate the impact of using mel spectrograms as the input to
|
||||||
|
WaveNet instead of linguistic, duration, and F0 features. We further demonstrate that using a compact acoustic
|
||||||
|
intermediate representation enables significant simplification of the WaveNet architecture.
|
||||||
|
|
||||||
|
Check :class:`TTS.tts.configs.tacotron2_config.Tacotron2Config` for model arguments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (TacotronConfig):
|
||||||
|
Configuration for the Tacotron2 model.
|
||||||
|
speaker_manager (SpeakerManager):
|
||||||
|
Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
chars, self.config = self.get_characters(config)
|
self.speaker_manager = speaker_manager
|
||||||
|
chars, self.config, _ = self.get_characters(config)
|
||||||
config.num_chars = len(chars)
|
config.num_chars = len(chars)
|
||||||
self.decoder_output_dim = config.out_channels
|
self.decoder_output_dim = config.out_channels
|
||||||
|
|
||||||
|
@ -31,9 +53,7 @@ class Tacotron2(BaseTacotron):
|
||||||
for key in config:
|
for key in config:
|
||||||
setattr(self, key, config[key])
|
setattr(self, key, config[key])
|
||||||
|
|
||||||
# set speaker embedding channel size for determining `in_channels` for the connected layers.
|
# init multi-speaker layers
|
||||||
# `init_multispeaker` needs to be called once more in training to initialize the speaker embedding layer based
|
|
||||||
# on the number of speakers infered from the dataset.
|
|
||||||
if self.use_speaker_embedding or self.use_d_vector_file:
|
if self.use_speaker_embedding or self.use_d_vector_file:
|
||||||
self.init_multispeaker(config)
|
self.init_multispeaker(config)
|
||||||
self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim
|
self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim
|
||||||
|
@ -103,6 +123,7 @@ class Tacotron2(BaseTacotron):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
|
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
|
||||||
|
"""Final reshape of the model output tensors."""
|
||||||
mel_outputs = mel_outputs.transpose(1, 2)
|
mel_outputs = mel_outputs.transpose(1, 2)
|
||||||
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
|
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
|
||||||
return mel_outputs, mel_outputs_postnet, alignments
|
return mel_outputs, mel_outputs_postnet, alignments
|
||||||
|
@ -110,13 +131,14 @@ class Tacotron2(BaseTacotron):
|
||||||
def forward( # pylint: disable=dangerous-default-value
|
def forward( # pylint: disable=dangerous-default-value
|
||||||
self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
|
self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
|
||||||
):
|
):
|
||||||
"""
|
"""Forward pass for training with Teacher Forcing.
|
||||||
|
|
||||||
Shapes:
|
Shapes:
|
||||||
text: [B, T_in]
|
text: :math:`[B, T_in]`
|
||||||
text_lengths: [B]
|
text_lengths: :math:`[B]`
|
||||||
mel_specs: [B, T_out, C]
|
mel_specs: :math:`[B, T_out, C]`
|
||||||
mel_lengths: [B]
|
mel_lengths: :math:`[B]`
|
||||||
aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C]
|
aux_input: 'speaker_ids': :math:`[B, 1]` and 'd_vectors': :math:`[B, C]`
|
||||||
"""
|
"""
|
||||||
aux_input = self._format_aux_input(aux_input)
|
aux_input = self._format_aux_input(aux_input)
|
||||||
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
|
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
|
||||||
|
@ -177,6 +199,12 @@ class Tacotron2(BaseTacotron):
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def inference(self, text, aux_input=None):
|
def inference(self, text, aux_input=None):
|
||||||
|
"""Forward pass for inference with no Teacher-Forcing.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
text: :math:`[B, T_in]`
|
||||||
|
text_lengths: :math:`[B]`
|
||||||
|
"""
|
||||||
aux_input = self._format_aux_input(aux_input)
|
aux_input = self._format_aux_input(aux_input)
|
||||||
embedded_inputs = self.embedding(text).transpose(1, 2)
|
embedded_inputs = self.embedding(text).transpose(1, 2)
|
||||||
encoder_outputs = self.encoder.inference(embedded_inputs)
|
encoder_outputs = self.encoder.inference(embedded_inputs)
|
||||||
|
@ -210,18 +238,17 @@ class Tacotron2(BaseTacotron):
|
||||||
}
|
}
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def train_step(self, batch, criterion):
|
def train_step(self, batch: Dict, criterion: torch.nn.Module):
|
||||||
"""Perform a single training step by fetching the right set if samples from the batch.
|
"""A single training step. Forward pass and loss computation.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
batch ([type]): [description]
|
batch ([Dict]): A dictionary of input tensors.
|
||||||
criterion ([type]): [description]
|
criterion ([type]): Callable criterion to compute model loss.
|
||||||
"""
|
"""
|
||||||
text_input = batch["text_input"]
|
text_input = batch["text_input"]
|
||||||
text_lengths = batch["text_lengths"]
|
text_lengths = batch["text_lengths"]
|
||||||
mel_input = batch["mel_input"]
|
mel_input = batch["mel_input"]
|
||||||
mel_lengths = batch["mel_lengths"]
|
mel_lengths = batch["mel_lengths"]
|
||||||
linear_input = batch["linear_input"]
|
|
||||||
stop_targets = batch["stop_targets"]
|
stop_targets = batch["stop_targets"]
|
||||||
stop_target_lengths = batch["stop_target_lengths"]
|
stop_target_lengths = batch["stop_target_lengths"]
|
||||||
speaker_ids = batch["speaker_ids"]
|
speaker_ids = batch["speaker_ids"]
|
||||||
|
@ -248,28 +275,30 @@ class Tacotron2(BaseTacotron):
|
||||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||||
|
|
||||||
# compute loss
|
# compute loss
|
||||||
loss_dict = criterion(
|
with autocast(enabled=False): # use float32 for the criterion
|
||||||
outputs["model_outputs"],
|
loss_dict = criterion(
|
||||||
outputs["decoder_outputs"],
|
outputs["model_outputs"].float(),
|
||||||
mel_input,
|
outputs["decoder_outputs"].float(),
|
||||||
linear_input,
|
mel_input.float(),
|
||||||
outputs["stop_tokens"],
|
None,
|
||||||
stop_targets,
|
outputs["stop_tokens"].float(),
|
||||||
stop_target_lengths,
|
stop_targets.float(),
|
||||||
mel_lengths,
|
stop_target_lengths,
|
||||||
outputs["decoder_outputs_backward"],
|
mel_lengths,
|
||||||
outputs["alignments"],
|
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
|
||||||
alignment_lengths,
|
outputs["alignments"].float(),
|
||||||
outputs["alignments_backward"],
|
alignment_lengths,
|
||||||
text_lengths,
|
None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
|
||||||
)
|
text_lengths,
|
||||||
|
)
|
||||||
|
|
||||||
# compute alignment error (the lower the better )
|
# compute alignment error (the lower the better )
|
||||||
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
|
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
|
||||||
loss_dict["align_error"] = align_error
|
loss_dict["align_error"] = align_error
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]:
|
def _create_logs(self, batch, outputs, ap):
|
||||||
|
"""Create dashboard log information."""
|
||||||
postnet_outputs = outputs["model_outputs"]
|
postnet_outputs = outputs["model_outputs"]
|
||||||
alignments = outputs["alignments"]
|
alignments = outputs["alignments"]
|
||||||
alignments_backward = outputs["alignments_backward"]
|
alignments_backward = outputs["alignments_backward"]
|
||||||
|
@ -289,11 +318,23 @@ class Tacotron2(BaseTacotron):
|
||||||
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
|
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
|
||||||
|
|
||||||
# Sample audio
|
# Sample audio
|
||||||
train_audio = ap.inv_melspectrogram(pred_spec.T)
|
audio = ap.inv_melspectrogram(pred_spec.T)
|
||||||
return figures, {"audio": train_audio}
|
return figures, {"audio": audio}
|
||||||
|
|
||||||
def eval_step(self, batch, criterion):
|
def train_log(
|
||||||
|
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
|
||||||
|
) -> None: # pylint: disable=no-self-use
|
||||||
|
"""Log training progress."""
|
||||||
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.train_figures(steps, figures)
|
||||||
|
logger.train_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
||||||
|
def eval_step(self, batch: dict, criterion: nn.Module):
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log(self, ap, batch, outputs):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
|
||||||
return self.train_log(ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
figures, audios = self._create_logs(batch, outputs, ap)
|
||||||
|
logger.eval_figures(steps, figures)
|
||||||
|
logger.eval_audios(steps, audios, ap.sample_rate)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import math
|
import math
|
||||||
|
import random
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
@ -14,10 +15,9 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock
|
||||||
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
||||||
from TTS.tts.models.base_tts import BaseTTS
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
|
||||||
from TTS.tts.utils.speakers import get_speaker_manager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.synthesis import synthesis
|
from TTS.tts.utils.synthesis import synthesis
|
||||||
from TTS.tts.utils.visual import plot_alignment
|
from TTS.tts.utils.visual import plot_alignment
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
|
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
|
||||||
from TTS.vocoder.models.hifigan_generator import HifiganGenerator
|
from TTS.vocoder.models.hifigan_generator import HifiganGenerator
|
||||||
from TTS.vocoder.utils.generic_utils import plot_results
|
from TTS.vocoder.utils.generic_utils import plot_results
|
||||||
|
@ -181,6 +181,7 @@ class VitsArgs(Coqpit):
|
||||||
speakers_file: str = None
|
speakers_file: str = None
|
||||||
speaker_embedding_channels: int = 256
|
speaker_embedding_channels: int = 256
|
||||||
use_d_vector_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
|
d_vector_file: str = None
|
||||||
d_vector_dim: int = 0
|
d_vector_dim: int = 0
|
||||||
detach_dp_input: bool = True
|
detach_dp_input: bool = True
|
||||||
|
|
||||||
|
@ -215,12 +216,13 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
# pylint: disable=dangerous-default-value
|
# pylint: disable=dangerous-default-value
|
||||||
|
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
|
||||||
|
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
|
||||||
self.END2END = True
|
self.END2END = True
|
||||||
|
|
||||||
|
self.speaker_manager = speaker_manager
|
||||||
if config.__class__.__name__ == "VitsConfig":
|
if config.__class__.__name__ == "VitsConfig":
|
||||||
# loading from VitsConfig
|
# loading from VitsConfig
|
||||||
if "num_chars" not in config:
|
if "num_chars" not in config:
|
||||||
|
@ -312,31 +314,42 @@ class Vits(BaseTTS):
|
||||||
if args.init_discriminator:
|
if args.init_discriminator:
|
||||||
self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator)
|
self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator)
|
||||||
|
|
||||||
def init_multispeaker(self, config: Coqpit, data: List = None):
|
def init_multispeaker(self, config: Coqpit):
|
||||||
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
|
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
|
||||||
or with external `d_vectors` computed from a speaker encoder model.
|
or with external `d_vectors` computed from a speaker encoder model.
|
||||||
|
|
||||||
If you need a different behaviour, override this function for your model.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): Model configuration.
|
config (Coqpit): Model configuration.
|
||||||
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
|
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
|
||||||
"""
|
"""
|
||||||
|
self.embedded_speaker_dim = 0
|
||||||
if hasattr(config, "model_args"):
|
if hasattr(config, "model_args"):
|
||||||
config = config.model_args
|
config = config.model_args
|
||||||
self.embedded_speaker_dim = 0
|
|
||||||
# init speaker manager
|
self.num_speakers = config.num_speakers
|
||||||
self.speaker_manager = get_speaker_manager(config, data=data)
|
|
||||||
if config.num_speakers > 0 and self.speaker_manager.num_speakers == 0:
|
if config.use_speaker_embedding:
|
||||||
self.speaker_manager.num_speakers = config.num_speakers
|
self._init_speaker_embedding(config)
|
||||||
self.num_speakers = self.speaker_manager.num_speakers
|
|
||||||
# init speaker embedding layer
|
|
||||||
if config.use_speaker_embedding and not config.use_d_vector_file:
|
|
||||||
self.embedded_speaker_dim = config.speaker_embedding_channels
|
|
||||||
self.emb_g = nn.Embedding(config.num_speakers, config.speaker_embedding_channels)
|
|
||||||
# init d-vector usage
|
|
||||||
if config.use_d_vector_file:
|
if config.use_d_vector_file:
|
||||||
self.embedded_speaker_dim = config.d_vector_dim
|
self._init_d_vector(config)
|
||||||
|
|
||||||
|
def _init_speaker_embedding(self, config):
|
||||||
|
# pylint: disable=attribute-defined-outside-init
|
||||||
|
if config.speakers_file is not None:
|
||||||
|
self.speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file_path)
|
||||||
|
|
||||||
|
if self.num_speakers > 0:
|
||||||
|
print(" > initialization of speaker-embedding layers.")
|
||||||
|
self.embedded_speaker_dim = config.speaker_embedding_channels
|
||||||
|
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
||||||
|
|
||||||
|
def _init_d_vector(self, config):
|
||||||
|
# pylint: disable=attribute-defined-outside-init
|
||||||
|
if hasattr(self, "emb_g"):
|
||||||
|
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
|
||||||
|
self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
|
||||||
|
self.embedded_speaker_dim = config.d_vector_dim
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _set_cond_input(aux_input: Dict):
|
def _set_cond_input(aux_input: Dict):
|
||||||
|
@ -350,6 +363,10 @@ class Vits(BaseTTS):
|
||||||
g = aux_input["d_vectors"]
|
g = aux_input["d_vectors"]
|
||||||
return sid, g
|
return sid, g
|
||||||
|
|
||||||
|
def get_aux_input(self, aux_input: Dict):
|
||||||
|
sid, g = self._set_cond_input(aux_input)
|
||||||
|
return {"speaker_id": sid, "style_wav": None, "d_vector": g}
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
x: torch.tensor,
|
x: torch.tensor,
|
||||||
|
@ -457,7 +474,7 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths)
|
x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths)
|
||||||
|
|
||||||
if self.num_speakers > 0 and sid:
|
if self.num_speakers > 0 and sid is not None:
|
||||||
g = self.emb_g(sid).unsqueeze(-1)
|
g = self.emb_g(sid).unsqueeze(-1)
|
||||||
|
|
||||||
if self.args.use_sdp:
|
if self.args.use_sdp:
|
||||||
|
@ -576,22 +593,7 @@ class Vits(BaseTTS):
|
||||||
)
|
)
|
||||||
return outputs, loss_dict
|
return outputs, loss_dict
|
||||||
|
|
||||||
def train_log(
|
def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unused-argument,no-self-use
|
||||||
self, ap: AudioProcessor, batch: Dict, outputs: List, name_prefix="train"
|
|
||||||
): # pylint: disable=no-self-use
|
|
||||||
"""Create visualizations and waveform examples.
|
|
||||||
|
|
||||||
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
|
|
||||||
be projected onto Tensorboard.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ap (AudioProcessor): audio processor used at training.
|
|
||||||
batch (Dict): Model inputs used at the previous training step.
|
|
||||||
outputs (Dict): Model outputs generated at the previoud training step.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
|
||||||
"""
|
|
||||||
y_hat = outputs[0]["model_outputs"]
|
y_hat = outputs[0]["model_outputs"]
|
||||||
y = outputs[0]["waveform_seg"]
|
y = outputs[0]["waveform_seg"]
|
||||||
figures = plot_results(y_hat, y, ap, name_prefix)
|
figures = plot_results(y_hat, y, ap, name_prefix)
|
||||||
|
@ -609,12 +611,32 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
return figures, audios
|
return figures, audios
|
||||||
|
|
||||||
|
def train_log(
    self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
):  # pylint: disable=no-self-use
    """Create visualizations and waveform examples for a training step.

    For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to
    be projected onto Tensorboard.

    Args:
        batch (dict): Model inputs used at the previous training step.
        outputs (dict): Model outputs generated at the previous training step.
        logger (Logger): Dashboard logger. Not used by this method.
        assets (dict): Training assets; `assets["audio_processor"]` is handed to `_log`.
        steps (int): Current training step. Not used by this method.

    Returns:
        The value returned by `self._log` for the `"train"` prefix (figures and audios).
    """
    ap = assets["audio_processor"]
    # return the figures/audios, as `eval_log` does, instead of silently discarding them
    return self._log(ap, batch, outputs, "train")
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int):
|
def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int):
|
||||||
return self.train_step(batch, criterion, optimizer_idx)
|
return self.train_step(batch, criterion, optimizer_idx)
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
|
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
|
||||||
return self.train_log(ap, batch, outputs, "eval")
|
ap = assets["audio_processor"]
|
||||||
|
return self._log(ap, batch, outputs, "eval")
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def test_run(self, ap) -> Tuple[Dict, Dict]:
|
def test_run(self, ap) -> Tuple[Dict, Dict]:
|
||||||
|
@ -629,7 +651,15 @@ class Vits(BaseTTS):
|
||||||
test_audios = {}
|
test_audios = {}
|
||||||
test_figures = {}
|
test_figures = {}
|
||||||
test_sentences = self.config.test_sentences
|
test_sentences = self.config.test_sentences
|
||||||
aux_inputs = self.get_aux_input()
|
aux_inputs = {
|
||||||
|
"speaker_id": None
|
||||||
|
if not self.config.use_speaker_embedding
|
||||||
|
else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1),
|
||||||
|
"d_vector": None
|
||||||
|
if not self.config.use_d_vector_file
|
||||||
|
else random.samples(sorted(self.speaker_manager.d_vectors.values()), 1),
|
||||||
|
"style_wav": None,
|
||||||
|
}
|
||||||
for idx, sen in enumerate(test_sentences):
|
for idx, sen in enumerate(test_sentences):
|
||||||
wav, alignment, _, _ = synthesis(
|
wav, alignment, _, _ = synthesis(
|
||||||
self,
|
self,
|
||||||
|
@ -666,7 +696,7 @@ class Vits(BaseTTS):
|
||||||
)
|
)
|
||||||
# add the speaker embedding layer
|
# add the speaker embedding layer
|
||||||
if hasattr(self, "emb_g"):
|
if hasattr(self, "emb_g"):
|
||||||
gen_parameters = chain(gen_parameters, self.emb_g)
|
gen_parameters = chain(gen_parameters, self.emb_g.parameters())
|
||||||
optimizer0 = get_optimizer(
|
optimizer0 = get_optimizer(
|
||||||
self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters
|
self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters
|
||||||
)
|
)
|
||||||
|
|
|
@ -63,7 +63,6 @@ class SpeakerManager:
|
||||||
use_cuda: bool = False,
|
use_cuda: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.data_items = []
|
|
||||||
self.d_vectors = {}
|
self.d_vectors = {}
|
||||||
self.speaker_ids = {}
|
self.speaker_ids = {}
|
||||||
self.clip_ids = []
|
self.clip_ids = []
|
||||||
|
@ -72,7 +71,7 @@ class SpeakerManager:
|
||||||
self.use_cuda = use_cuda
|
self.use_cuda = use_cuda
|
||||||
|
|
||||||
if data_items:
|
if data_items:
|
||||||
self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items)
|
self.speaker_ids, _ = self.parse_speakers_from_data(data_items)
|
||||||
|
|
||||||
if d_vectors_file_path:
|
if d_vectors_file_path:
|
||||||
self.set_d_vectors_from_file(d_vectors_file_path)
|
self.set_d_vectors_from_file(d_vectors_file_path)
|
||||||
|
@ -110,10 +109,10 @@ class SpeakerManager:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_speakers_from_data(items: list) -> Tuple[Dict, int]:
|
def parse_speakers_from_data(items: list) -> Tuple[Dict, int]:
|
||||||
"""Parse speaker IDs from data samples retured by `load_meta_data()`.
|
"""Parse speaker IDs from data samples retured by `load_tts_samples()`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
items (list): Data sampled returned by `load_meta_data()`.
|
items (list): Data samples returned by `load_tts_samples()`.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[Dict, int]: speaker IDs and number of speakers.
|
Tuple[Dict, int]: speaker IDs and number of speakers.
|
||||||
|
@ -127,7 +126,7 @@ class SpeakerManager:
|
||||||
"""Set speaker IDs from data samples.
|
"""Set speaker IDs from data samples.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
items (List): Data sampled returned by `load_meta_data()`.
|
items (List): Data samples returned by `load_tts_samples()`.
|
||||||
"""
|
"""
|
||||||
self.speaker_ids, _ = self.parse_speakers_from_data(items)
|
self.speaker_ids, _ = self.parse_speakers_from_data(items)
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,10 @@ def _ssim(img1, img2, window, window_size, channel, size_average=True):
|
||||||
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
|
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
|
||||||
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
|
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
|
||||||
|
|
||||||
mu1_sq = mu1.pow(2)
|
# TODO: check if you need AMP disabled
|
||||||
mu2_sq = mu2.pow(2)
|
# with torch.cuda.amp.autocast(enabled=False):
|
||||||
|
mu1_sq = mu1.float().pow(2)
|
||||||
|
mu2_sq = mu2.float().pow(2)
|
||||||
mu1_mu2 = mu1 * mu2
|
mu1_mu2 = mu1 * mu2
|
||||||
|
|
||||||
sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
|
sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
|
||||||
|
|
|
@ -172,7 +172,7 @@ def speaker_id_to_torch(speaker_id, cuda=False):
|
||||||
def embedding_to_torch(d_vector, cuda=False):
|
def embedding_to_torch(d_vector, cuda=False):
|
||||||
if d_vector is not None:
|
if d_vector is not None:
|
||||||
d_vector = np.asarray(d_vector)
|
d_vector = np.asarray(d_vector)
|
||||||
d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor)
|
d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
|
||||||
if cuda:
|
if cuda:
|
||||||
return d_vector.cuda()
|
return d_vector.cuda()
|
||||||
return d_vector
|
return d_vector
|
||||||
|
@ -210,20 +210,42 @@ def synthesis(
|
||||||
d_vector=None,
|
d_vector=None,
|
||||||
backend="torch",
|
backend="torch",
|
||||||
):
|
):
|
||||||
"""Synthesize voice for the given text.
|
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
|
||||||
|
the vocoder model.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model (TTS.tts.models): model to synthesize.
|
model (TTS.tts.models):
|
||||||
text (str): target text
|
The TTS model to synthesize audio with.
|
||||||
CONFIG (dict): config dictionary to be loaded from config.json.
|
|
||||||
use_cuda (bool): enable cuda.
|
text (str):
|
||||||
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
|
The input text to convert to speech.
|
||||||
model outputs.
|
|
||||||
speaker_id (int): id of speaker
|
CONFIG (Coqpit):
|
||||||
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
|
Model configuration.
|
||||||
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
|
|
||||||
do_trim_silence (bool): trim silence after synthesis.
|
use_cuda (bool):
|
||||||
backend (str): tf or torch
|
Enable/disable CUDA.
|
||||||
|
|
||||||
|
ap (TTS.tts.utils.audio.AudioProcessor):
|
||||||
|
The audio processor for extracting features and pre/post-processing audio.
|
||||||
|
|
||||||
|
speaker_id (int):
|
||||||
|
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
|
||||||
|
|
||||||
|
style_wav (str | Dict[str, float]):
|
||||||
|
Path or tensor to/of a waveform used for computing the style embedding. Defaults to None.
|
||||||
|
|
||||||
|
enable_eos_bos_chars (bool):
|
||||||
|
enable special chars for end of sentence and start of sentence. Defaults to False.
|
||||||
|
|
||||||
|
do_trim_silence (bool):
|
||||||
|
trim silence after synthesis. Defaults to False.
|
||||||
|
|
||||||
|
d_vector (torch.Tensor):
|
||||||
|
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
|
||||||
|
|
||||||
|
backend (str):
|
||||||
|
tf or torch. Defaults to "torch".
|
||||||
"""
|
"""
|
||||||
# GST processing
|
# GST processing
|
||||||
style_mel = None
|
style_mel = None
|
||||||
|
|
|
@ -108,6 +108,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
||||||
class AudioProcessor(object):
|
class AudioProcessor(object):
|
||||||
"""Audio Processor for TTS used by all the data pipelines.
|
"""Audio Processor for TTS used by all the data pipelines.
|
||||||
|
|
||||||
|
TODO: Make this a dataclass to replace `BaseAudioConfig`.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
All the class arguments are set to default values to enable a flexible initialization
|
All the class arguments are set to default values to enable a flexible initialization
|
||||||
of the class with the model config. They are not meaningful for all the arguments.
|
of the class with the model config. They are not meaningful for all the arguments.
|
||||||
|
@ -643,6 +645,10 @@ class AudioProcessor(object):
|
||||||
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
|
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
|
||||||
>>> pitch = ap.compute_f0(wav)
|
>>> pitch = ap.compute_f0(wav)
|
||||||
"""
|
"""
|
||||||
|
# align F0 length to the spectrogram length
|
||||||
|
if len(x) % self.hop_length == 0:
|
||||||
|
x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
|
||||||
|
|
||||||
f0, t = pw.dio(
|
f0, t = pw.dio(
|
||||||
x.astype(np.double),
|
x.astype(np.double),
|
||||||
fs=self.sample_rate,
|
fs=self.sample_rate,
|
||||||
|
@ -745,6 +751,14 @@ class AudioProcessor(object):
|
||||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||||
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
|
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
|
||||||
|
|
||||||
|
def get_duration(self, filename: str) -> float:
    """Get the duration of a wav file using Librosa.

    Args:
        filename (str): Path to the wav file.

    Returns:
        float: Duration reported by `librosa.get_duration` for the file.
    """
    # Pass the path by keyword: the first positional parameter of `librosa.get_duration` is the audio
    # time series `y`, so a positional path is treated as a signal and rejected.
    return librosa.get_duration(filename=filename)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
|
def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
|
||||||
mu = 2 ** qc - 1
|
mu = 2 ** qc - 1
|
||||||
|
|
|
@ -38,7 +38,8 @@ def copy_model_files(config: Coqpit, out_path, new_fields):
|
||||||
"""
|
"""
|
||||||
copy_config_path = os.path.join(out_path, "config.json")
|
copy_config_path = os.path.join(out_path, "config.json")
|
||||||
# add extra information fields
|
# add extra information fields
|
||||||
config.update(new_fields, allow_new=True)
|
if new_fields:
|
||||||
|
config.update(new_fields, allow_new=True)
|
||||||
# TODO: Revert to config.save_json() once Coqpit supports arbitrary paths.
|
# TODO: Revert to config.save_json() once Coqpit supports arbitrary paths.
|
||||||
with fsspec.open(copy_config_path, "w", encoding="utf8") as f:
|
with fsspec.open(copy_config_path, "w", encoding="utf8") as f:
|
||||||
json.dump(config.to_dict(), f, indent=4)
|
json.dump(config.to_dict(), f, indent=4)
|
||||||
|
|
|
@ -47,11 +47,19 @@ class ConsoleLogger:
|
||||||
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC
|
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC
|
||||||
)
|
)
|
||||||
for key, value in loss_dict.items():
|
for key, value in loss_dict.items():
|
||||||
# print the avg value if given
|
|
||||||
if f"avg_{key}" in avg_loss_dict.keys():
|
if f"avg_{key}" in avg_loss_dict.keys():
|
||||||
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
|
# print the avg value if given
|
||||||
|
if isinstance(value, float) and round(value, 5) == 0:
|
||||||
|
# do not round the number if it is zero when rounded
|
||||||
|
log_text += "{}{}: {} ({})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
|
||||||
|
else:
|
||||||
|
# print the rounded value
|
||||||
|
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
|
||||||
else:
|
else:
|
||||||
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
|
if isinstance(value, float) and round(value, 5) == 0:
|
||||||
|
log_text += "{}{}: {} \n".format(indent, key, value)
|
||||||
|
else:
|
||||||
|
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
|
||||||
print(log_text, flush=True)
|
print(log_text, flush=True)
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
# pylint: disable=unused-argument
|
||||||
|
|
|
@ -87,52 +87,15 @@ class Synthesizer(object):
|
||||||
"""
|
"""
|
||||||
return pysbd.Segmenter(language=lang, clean=True)
|
return pysbd.Segmenter(language=lang, clean=True)
|
||||||
|
|
||||||
def _load_speakers(self, speaker_file: str) -> None:
|
|
||||||
"""Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker
|
|
||||||
encoder if it is defined.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
speaker_file (str): path to the speakers meta-data file.
|
|
||||||
"""
|
|
||||||
print("Loading speakers ...")
|
|
||||||
self.speaker_manager = SpeakerManager(
|
|
||||||
encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
|
|
||||||
)
|
|
||||||
self.speaker_manager.load_d_vectors_file(self.tts_config.get("d_vector_file", speaker_file))
|
|
||||||
self.num_speakers = self.speaker_manager.num_speakers
|
|
||||||
self.d_vector_dim = self.speaker_manager.d_vector_dim
|
|
||||||
|
|
||||||
def _set_tts_speaker_file(self):
|
|
||||||
"""Set the TTS speaker file used by a multi-speaker model."""
|
|
||||||
# setup if multi-speaker settings are in the global model config
|
|
||||||
if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True:
|
|
||||||
if self.tts_config.use_d_vector_file:
|
|
||||||
self.tts_speakers_file = (
|
|
||||||
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]
|
|
||||||
)
|
|
||||||
self.tts_config["d_vector_file"] = self.tts_speakers_file
|
|
||||||
else:
|
|
||||||
self.tts_speakers_file = (
|
|
||||||
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["speakers_file"]
|
|
||||||
)
|
|
||||||
|
|
||||||
# setup if multi-speaker settings are in the model args config
|
|
||||||
if (
|
|
||||||
self.tts_speakers_file is None
|
|
||||||
and hasattr(self.tts_config, "model_args")
|
|
||||||
and hasattr(self.tts_config.model_args, "use_speaker_embedding")
|
|
||||||
and self.tts_config.model_args.use_speaker_embedding
|
|
||||||
):
|
|
||||||
_args = self.tts_config.model_args
|
|
||||||
if _args.use_d_vector_file:
|
|
||||||
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["d_vector_file"]
|
|
||||||
_args["d_vector_file"] = self.tts_speakers_file
|
|
||||||
else:
|
|
||||||
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["speakers_file"]
|
|
||||||
|
|
||||||
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
|
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
|
||||||
"""Load the TTS model.
|
"""Load the TTS model.
|
||||||
|
|
||||||
|
1. Load the model config.
|
||||||
|
2. Init the AudioProcessor.
|
||||||
|
3. Init the model from the config.
|
||||||
|
4. Move the model to the GPU if CUDA is enabled.
|
||||||
|
5. Init the speaker manager for the model.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tts_checkpoint (str): path to the model checkpoint.
|
tts_checkpoint (str): path to the model checkpoint.
|
||||||
tts_config_path (str): path to the model config file.
|
tts_config_path (str): path to the model config file.
|
||||||
|
@ -144,15 +107,38 @@ class Synthesizer(object):
|
||||||
self.use_phonemes = self.tts_config.use_phonemes
|
self.use_phonemes = self.tts_config.use_phonemes
|
||||||
self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
|
self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
|
||||||
|
|
||||||
self.tts_model = setup_tts_model(config=self.tts_config)
|
speaker_manager = self._init_speaker_manager()
|
||||||
|
|
||||||
|
self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager)
|
||||||
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
|
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
self.tts_model.cuda()
|
self.tts_model.cuda()
|
||||||
self._set_tts_speaker_file()
|
|
||||||
|
def _init_speaker_manager(self):
    """Initialize the SpeakerManager from the multi-speaker settings of the model config.

    With `use_speaker_embedding` enabled, the manager is loaded from a speaker-ID file; with
    `use_d_vector_file` enabled, it is loaded from a d-vector file. In both cases the file named in the
    config is applied after `self.tts_speakers_file` and therefore wins when both are set. The d-vector
    branch runs last, so it overrides a speaker-ID manager.

    Returns:
        The created `SpeakerManager`, or `None` when no multi-speaker setting applies.
    """
    # setup if multi-speaker settings are in the global model config
    speaker_manager = None
    if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True:
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file)
        if self.tts_config.get("speakers_file", None):
            speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_config.speakers_file)

    # Guard on the d-vector flag itself. Checking `use_speaker_embedding` here made a plain speaker-ID
    # file get reloaded as a d-vector file.
    if hasattr(self.tts_config, "use_d_vector_file") and self.tts_config.use_d_vector_file is True:
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file)
        if self.tts_config.get("d_vector_file", None):
            speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file)
    return speaker_manager
|
||||||
|
|
||||||
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
|
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
|
||||||
"""Load the vocoder model.
|
"""Load the vocoder model.
|
||||||
|
|
||||||
|
1. Load the vocoder config.
|
||||||
|
2. Init the AudioProcessor for the vocoder.
|
||||||
|
3. Init the vocoder model from the config.
|
||||||
|
4. Move the model to the GPU if CUDA is enabled.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_file (str): path to the model checkpoint.
|
model_file (str): path to the model checkpoint.
|
||||||
model_config (str): path to the model config file.
|
model_config (str): path to the model config file.
|
||||||
|
@ -207,11 +193,12 @@ class Synthesizer(object):
|
||||||
# handle multi-speaker
|
# handle multi-speaker
|
||||||
speaker_embedding = None
|
speaker_embedding = None
|
||||||
speaker_id = None
|
speaker_id = None
|
||||||
if self.tts_speakers_file:
|
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
|
||||||
if speaker_idx and isinstance(speaker_idx, str):
|
if speaker_idx and isinstance(speaker_idx, str):
|
||||||
if self.tts_config.use_d_vector_file:
|
if self.tts_config.use_d_vector_file:
|
||||||
# get the speaker embedding from the saved d_vectors.
|
# get the speaker embedding from the saved d_vectors.
|
||||||
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
|
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
|
||||||
|
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
|
||||||
else:
|
else:
|
||||||
# get speaker idx from the speaker name
|
# get speaker idx from the speaker name
|
||||||
speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx]
|
speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx]
|
||||||
|
@ -226,7 +213,7 @@ class Synthesizer(object):
|
||||||
else:
|
else:
|
||||||
if speaker_idx:
|
if speaker_idx:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}."
|
f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}."
|
||||||
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
|
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,13 @@
|
||||||
import importlib
|
import importlib
|
||||||
|
import os
|
||||||
|
import re
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import fsspec
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from TTS.utils.io import load_fsspec
|
||||||
from TTS.utils.training import NoamLR
|
from TTS.utils.training import NoamLR
|
||||||
|
|
||||||
|
|
||||||
|
@ -80,3 +85,66 @@ def get_optimizer(
|
||||||
if model is not None:
|
if model is not None:
|
||||||
parameters = model.parameters()
|
parameters = model.parameters()
|
||||||
return optimizer(parameters, lr=lr, **optimizer_params)
|
return optimizer(parameters, lr=lr, **optimizer_params)
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_checkpoint(path: str) -> Tuple[str, str]:
    """Get latest checkpoint or/and best model in path.

    It is based on globbing for `*.pth.tar` and the RegEx
    `(checkpoint|best_model)_([0-9]+)`.

    Args:
        path: Path to files to be compared.

    Raises:
        ValueError: If no checkpoint or best_model files are found.

    Returns:
        Tuple[str, str]: Path to the last checkpoint and path to the best model. When only one kind is
        found it is returned for both, and the best model is also returned as the checkpoint when its
        number is larger than the checkpoint's.
    """
    fs = fsspec.get_mapper(path).fs
    file_names = fs.glob(os.path.join(path, "*.pth.tar"))
    scheme = urlparse(path).scheme
    if scheme:  # scheme is not preserved in fs.glob, add it back
        file_names = [scheme + "://" + file_name for file_name in file_names]
    last_models = {}
    last_model_nums = {}
    for key in ["checkpoint", "best_model"]:
        last_model_num = None
        last_model = None
        # pass all the checkpoint files and find
        # the one with the largest model number suffix.
        # NOTE: the pattern is searched in the whole path string, not only in the file name.
        for file_name in file_names:
            match = re.search(f"{key}_([0-9]+)", file_name)
            if match is not None:
                model_num = int(match.groups()[0])
                if last_model_num is None or model_num > last_model_num:
                    last_model_num = model_num
                    last_model = file_name

        # if no numbered file was found above, fall back to the file
        # containing `key` with the largest `os.path.getctime` value
        # and read its step count from the stored checkpoint.
        key_file_names = [fn for fn in file_names if key in fn]
        if last_model is None and len(key_file_names) > 0:
            last_model = max(key_file_names, key=os.path.getctime)
            last_model_num = load_fsspec(last_model)["step"]

        if last_model is not None:
            last_models[key] = last_model
            last_model_nums[key] = last_model_num

    # check what models were found
    if not last_models:
        raise ValueError(f"No models found in continue path {path}!")
    if "checkpoint" not in last_models:  # no checkpoint just best model
        last_models["checkpoint"] = last_models["best_model"]
    elif "best_model" not in last_models:  # no best model
        # this shouldn't happen, but let's handle it just in case
        last_models["best_model"] = last_models["checkpoint"]
    # finally check if last best model is more recent than checkpoint
    elif last_model_nums["best_model"] > last_model_nums["checkpoint"]:
        last_models["checkpoint"] = last_models["best_model"]

    return last_models["checkpoint"], last_models["best_model"]
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from coqpit import MISSING
|
|
||||||
|
|
||||||
from TTS.config import BaseAudioConfig, BaseTrainingConfig
|
from TTS.config import BaseAudioConfig, BaseTrainingConfig
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,11 +15,11 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
||||||
Number of instances used for evaluation. Defaults to 10.
|
Number of instances used for evaluation. Defaults to 10.
|
||||||
data_path (str):
|
data_path (str):
|
||||||
Root path of the training data. All the audio files found recursively from this root path are used for
|
Root path of the training data. All the audio files found recursively from this root path are used for
|
||||||
training. Defaults to MISSING.
|
training. Defaults to `""`.
|
||||||
feature_path (str):
|
feature_path (str):
|
||||||
Root path to the precomputed feature files. Defaults to None.
|
Root path to the precomputed feature files. Defaults to None.
|
||||||
seq_len (int):
|
seq_len (int):
|
||||||
Length of the waveform segments used for training. Defaults to MISSING.
|
Length of the waveform segments used for training. Defaults to 1000.
|
||||||
pad_short (int):
|
pad_short (int):
|
||||||
Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
|
Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
|
||||||
conv_path (int):
|
conv_path (int):
|
||||||
|
@ -45,9 +43,9 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
||||||
use_noise_augment: bool = False # enable/disable random noise augmentation in spectrograms.
|
use_noise_augment: bool = False # enable/disable random noise augmentation in spectrograms.
|
||||||
eval_split_size: int = 10 # number of samples used for evaluation.
|
eval_split_size: int = 10 # number of samples used for evaluation.
|
||||||
# dataset
|
# dataset
|
||||||
data_path: str = MISSING # root data path. It finds all wav files recursively from there.
|
data_path: str = "" # root data path. It finds all wav files recursively from there.
|
||||||
feature_path: str = None # if you use precomputed features
|
feature_path: str = None # if you use precomputed features
|
||||||
seq_len: int = MISSING # signal length used in training.
|
seq_len: int = 1000 # signal length used in training.
|
||||||
pad_short: int = 0 # additional padding for short wavs
|
pad_short: int = 0 # additional padding for short wavs
|
||||||
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
|
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
|
||||||
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
|
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
|
||||||
|
|
|
@ -75,7 +75,7 @@ class WavernnConfig(BaseVocoderConfig):
|
||||||
model: str = "wavernn"
|
model: str = "wavernn"
|
||||||
|
|
||||||
# Model specific params
|
# Model specific params
|
||||||
model_params: WavernnArgs = field(default_factory=WavernnArgs)
|
model_args: WavernnArgs = field(default_factory=WavernnArgs)
|
||||||
target_loss: str = "loss"
|
target_loss: str = "loss"
|
||||||
|
|
||||||
# Inference
|
# Inference
|
||||||
|
|
|
@ -5,6 +5,7 @@ from torch.utils.data import Dataset
|
||||||
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.datasets.gan_dataset import GANDataset
|
from TTS.vocoder.datasets.gan_dataset import GANDataset
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||||
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
||||||
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
|
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
|
||||||
|
|
||||||
|
|
|
@ -29,9 +29,7 @@ def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
|
||||||
mel = ap.melspectrogram(y)
|
mel = ap.melspectrogram(y)
|
||||||
np.save(mel_path, mel)
|
np.save(mel_path, mel)
|
||||||
if isinstance(config.mode, int):
|
if isinstance(config.mode, int):
|
||||||
quant = (
|
quant = ap.mulaw_encode(y, qc=config.mode) if config.model_args.mulaw else ap.quantize(y, bits=config.mode)
|
||||||
ap.mulaw_encode(y, qc=config.mode) if config.model_params.mulaw else ap.quantize(y, bits=config.mode)
|
|
||||||
)
|
|
||||||
np.save(quant_path, quant)
|
np.save(quant_path, quant)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from coqpit import Coqpit
|
||||||
|
|
||||||
from TTS.model import BaseModel
|
from TTS.model import BaseModel
|
||||||
|
|
||||||
# pylint: skip-file
|
# pylint: skip-file
|
||||||
|
@ -16,5 +18,35 @@ class BaseVocoder(BaseModel):
|
||||||
- 1D tensors `batch x 1`
|
- 1D tensors `batch x 1`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config):
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
|
||||||
|
def _set_model_args(self, config: Coqpit):
|
||||||
|
"""Setup model args based on the config type.
|
||||||
|
|
||||||
|
If the config is for training with a name like "*Config", then the model args are embedded in the
|
||||||
|
config.model_args
|
||||||
|
|
||||||
|
If the config is for the model with a name like "*Args", then we assign them directly.
|
||||||
|
"""
|
||||||
|
# don't use isinstance, to avoid recursive imports
|
||||||
|
if "Config" in config.__class__.__name__:
|
||||||
|
if "characters" in config:
|
||||||
|
_, self.config, num_chars = self.get_characters(config)
|
||||||
|
self.config.num_chars = num_chars
|
||||||
|
if hasattr(self.config, "model_args"):
|
||||||
|
config.model_args.num_chars = num_chars
|
||||||
|
if "model_args" in config:
|
||||||
|
self.args = self.config.model_args
|
||||||
|
# This is for backward compatibility
|
||||||
|
if "model_params" in config:
|
||||||
|
self.args = self.config.model_params
|
||||||
|
else:
|
||||||
|
self.config = config
|
||||||
|
if "model_args" in config:
|
||||||
|
self.args = self.config.model_args
|
||||||
|
# This is for backward compatibility
|
||||||
|
if "model_params" in config:
|
||||||
|
self.args = self.config.model_params
|
||||||
|
else:
|
||||||
|
raise ValueError("config must be either a *Config or *Args")
|
||||||
|
|
|
@ -35,7 +35,7 @@ class GAN(BaseVocoder):
|
||||||
>>> config = HifiganConfig()
|
>>> config = HifiganConfig()
|
||||||
>>> model = GAN(config)
|
>>> model = GAN(config)
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.model_g = setup_generator(config)
|
self.model_g = setup_generator(config)
|
||||||
self.model_d = setup_discriminator(config)
|
self.model_d = setup_discriminator(config)
|
||||||
|
@ -197,18 +197,24 @@ class GAN(BaseVocoder):
|
||||||
audios = {f"{name}/audio": sample_voice}
|
audios = {f"{name}/audio": sample_voice}
|
||||||
return figures, audios
|
return figures, audios
|
||||||
|
|
||||||
def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
|
def train_log(
|
||||||
|
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
|
||||||
|
) -> Tuple[Dict, np.ndarray]:
|
||||||
"""Call `_log()` for training."""
|
"""Call `_log()` for training."""
|
||||||
return self._log("train", ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
self._log("train", ap, batch, outputs)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
|
def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
|
||||||
"""Call `train_step()` with `no_grad()`"""
|
"""Call `train_step()` with `no_grad()`"""
|
||||||
return self.train_step(batch, criterion, optimizer_idx)
|
return self.train_step(batch, criterion, optimizer_idx)
|
||||||
|
|
||||||
def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
|
def eval_log(
|
||||||
|
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
|
||||||
|
) -> Tuple[Dict, np.ndarray]:
|
||||||
"""Call `_log()` for evaluation."""
|
"""Call `_log()` for evaluation."""
|
||||||
return self._log("eval", ap, batch, outputs)
|
ap = assets["audio_processor"]
|
||||||
|
self._log("eval", ap, batch, outputs)
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self,
|
self,
|
||||||
|
@ -299,7 +305,7 @@ class GAN(BaseVocoder):
|
||||||
def get_data_loader( # pylint: disable=no-self-use
|
def get_data_loader( # pylint: disable=no-self-use
|
||||||
self,
|
self,
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
ap: AudioProcessor,
|
assets: Dict,
|
||||||
is_eval: True,
|
is_eval: True,
|
||||||
data_items: List,
|
data_items: List,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
@ -318,6 +324,7 @@ class GAN(BaseVocoder):
|
||||||
Returns:
|
Returns:
|
||||||
DataLoader: Torch dataloader.
|
DataLoader: Torch dataloader.
|
||||||
"""
|
"""
|
||||||
|
ap = assets["audio_processor"]
|
||||||
dataset = GANDataset(
|
dataset = GANDataset(
|
||||||
ap=ap,
|
ap=ap,
|
||||||
items=data_items,
|
items=data_items,
|
||||||
|
|
|
@ -9,7 +9,6 @@ from torch.nn.utils import weight_norm
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
from TTS.utils.audio import AudioProcessor
|
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
|
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
|
||||||
from TTS.vocoder.datasets import WaveGradDataset
|
from TTS.vocoder.datasets import WaveGradDataset
|
||||||
|
@ -58,7 +57,7 @@ class Wavegrad(BaseVocoder):
|
||||||
|
|
||||||
# pylint: disable=dangerous-default-value
|
# pylint: disable=dangerous-default-value
|
||||||
def __init__(self, config: Coqpit):
|
def __init__(self, config: Coqpit):
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.use_weight_norm = config.model_params.use_weight_norm
|
self.use_weight_norm = config.model_params.use_weight_norm
|
||||||
self.hop_len = np.prod(config.model_params.upsample_factors)
|
self.hop_len = np.prod(config.model_params.upsample_factors)
|
||||||
|
@ -258,21 +257,22 @@ class Wavegrad(BaseVocoder):
|
||||||
return {"model_output": noise_hat}, {"loss": loss}
|
return {"model_output": noise_hat}, {"loss": loss}
|
||||||
|
|
||||||
def train_log( # pylint: disable=no-self-use
|
def train_log( # pylint: disable=no-self-use
|
||||||
self, ap: AudioProcessor, batch: Dict, outputs: Dict # pylint: disable=unused-argument
|
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
|
||||||
) -> Tuple[Dict, np.ndarray]:
|
) -> Tuple[Dict, np.ndarray]:
|
||||||
return None, None
|
pass
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||||
return self.train_step(batch, criterion)
|
return self.train_step(batch, criterion)
|
||||||
|
|
||||||
def eval_log( # pylint: disable=no-self-use
|
def eval_log( # pylint: disable=no-self-use
|
||||||
self, ap: AudioProcessor, batch: Dict, outputs: Dict # pylint: disable=unused-argument
|
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
|
||||||
) -> Tuple[Dict, np.ndarray]:
|
) -> None:
|
||||||
return None, None
|
pass
|
||||||
|
|
||||||
def test_run(self, ap: AudioProcessor, samples: List[Dict], ouputs: Dict): # pylint: disable=unused-argument
|
def test_run(self, assets: Dict, samples: List[Dict], outputs: Dict): # pylint: disable=unused-argument
|
||||||
# setup noise schedule and inference
|
# setup noise schedule and inference
|
||||||
|
ap = assets["audio_processor"]
|
||||||
noise_schedule = self.config["test_noise_schedule"]
|
noise_schedule = self.config["test_noise_schedule"]
|
||||||
betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
|
betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
|
||||||
self.compute_noise_level(betas)
|
self.compute_noise_level(betas)
|
||||||
|
@ -307,8 +307,9 @@ class Wavegrad(BaseVocoder):
|
||||||
return {"input": m, "waveform": y}
|
return {"input": m, "waveform": y}
|
||||||
|
|
||||||
def get_data_loader(
|
def get_data_loader(
|
||||||
self, config: Coqpit, ap: AudioProcessor, is_eval: True, data_items: List, verbose: bool, num_gpus: int
|
self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int
|
||||||
):
|
):
|
||||||
|
ap = assets["audio_processor"]
|
||||||
dataset = WaveGradDataset(
|
dataset = WaveGradDataset(
|
||||||
ap=ap,
|
ap=ap,
|
||||||
items=data_items,
|
items=data_items,
|
||||||
|
|
|
@ -222,10 +222,7 @@ class Wavernn(BaseVocoder):
|
||||||
samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an
|
samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an
|
||||||
orthogonal method for increasing sampling efficiency.
|
orthogonal method for increasing sampling efficiency.
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__(config)
|
||||||
|
|
||||||
self.args = config.model_params
|
|
||||||
self.config = config
|
|
||||||
|
|
||||||
if isinstance(self.args.mode, int):
|
if isinstance(self.args.mode, int):
|
||||||
self.n_classes = 2 ** self.args.mode
|
self.n_classes = 2 ** self.args.mode
|
||||||
|
@ -572,8 +569,9 @@ class Wavernn(BaseVocoder):
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def test_run(
|
def test_run(
|
||||||
self, ap: AudioProcessor, samples: List[Dict], output: Dict # pylint: disable=unused-argument
|
self, assets: Dict, samples: List[Dict], output: Dict # pylint: disable=unused-argument
|
||||||
) -> Tuple[Dict, Dict]:
|
) -> Tuple[Dict, Dict]:
|
||||||
|
ap = assets["audio_processor"]
|
||||||
figures = {}
|
figures = {}
|
||||||
audios = {}
|
audios = {}
|
||||||
for idx, sample in enumerate(samples):
|
for idx, sample in enumerate(samples):
|
||||||
|
@ -600,20 +598,21 @@ class Wavernn(BaseVocoder):
|
||||||
def get_data_loader( # pylint: disable=no-self-use
|
def get_data_loader( # pylint: disable=no-self-use
|
||||||
self,
|
self,
|
||||||
config: Coqpit,
|
config: Coqpit,
|
||||||
ap: AudioProcessor,
|
assets: Dict,
|
||||||
is_eval: True,
|
is_eval: True,
|
||||||
data_items: List,
|
data_items: List,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
num_gpus: int,
|
num_gpus: int,
|
||||||
):
|
):
|
||||||
|
ap = assets["audio_processor"]
|
||||||
dataset = WaveRNNDataset(
|
dataset = WaveRNNDataset(
|
||||||
ap=ap,
|
ap=ap,
|
||||||
items=data_items,
|
items=data_items,
|
||||||
seq_len=config.seq_len,
|
seq_len=config.seq_len,
|
||||||
hop_len=ap.hop_length,
|
hop_len=ap.hop_length,
|
||||||
pad=config.model_params.pad,
|
pad=config.model_args.pad,
|
||||||
mode=config.model_params.mode,
|
mode=config.model_args.mode,
|
||||||
mulaw=config.model_params.mulaw,
|
mulaw=config.model_args.mulaw,
|
||||||
is_training=not is_eval,
|
is_training=not is_eval,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
)
|
)
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
We keep tests under `tests` folder. You can add `tts` layers tests under `tts_tests` folder.
|
We keep tests under `tests` folder. You can add `tts` layers tests under `tts_tests` folder.
|
||||||
Basic tests are checking input-output tensor shapes and output values for a given input. Consider testing extreme cases that are more likely to cause problems like `zero` tensors.
|
Basic tests are checking input-output tensor shapes and output values for a given input. Consider testing extreme cases that are more likely to cause problems like `zero` tensors.
|
||||||
|
|
||||||
3. Implement loss function.
|
3. Implement a loss function.
|
||||||
|
|
||||||
We keep loss functions under `TTS/tts/layers/losses.py`. You can also mix-and-match implemented loss functions as you like.
|
We keep loss functions under `TTS/tts/layers/losses.py`. You can also mix-and-match implemented loss functions as you like.
|
||||||
|
|
||||||
|
@ -29,19 +29,20 @@
|
||||||
|
|
||||||
A model interacts with the `Trainer API` for training, `Synthesizer API` for inference and testing.
|
A model interacts with the `Trainer API` for training, `Synthesizer API` for inference and testing.
|
||||||
|
|
||||||
A 🐸TTS model must return a dictionary by the `forward()` and `inference()` functions. This dictionary must also include the `model_outputs` key that is considered as the main model output by the `Trainer` and `Synthesizer`.
|
A 🐸TTS model must return a dictionary by the `forward()` and `inference()` functions. This dictionary must include the `model_outputs` key that is considered as the main model output by the `Trainer` and `Synthesizer`.
|
||||||
|
|
||||||
You can place your `tts` model implementation under `TTS/tts/models/new_model.py` then inherit and implement the `BaseTTS`.
|
You can place your `tts` model implementation under `TTS/tts/models/new_model.py` then inherit and implement the `BaseTTS`.
|
||||||
|
|
||||||
There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you
|
There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you
|
||||||
the infinite flexibility to add custom behaviours for your model and training routines.
|
infinite flexibility to add custom behaviours for your model and training routines.
|
||||||
|
|
||||||
For more details, see {ref}`BaseTTS <Base TTS Model>` and :obj:`TTS.utils.callbacks`.
|
For more details, see {ref}`BaseTTS <Base TTS Model>` and :obj:`TTS.utils.callbacks`.
|
||||||
|
|
||||||
6. Optionally, define `MyModelArgs`.
|
6. Optionally, define `MyModelArgs`.
|
||||||
|
|
||||||
`MyModelArgs` is a 👨✈️Coqpit class that sets all the class arguments of the `MyModel`. It should be enough to pass
|
`MyModelArgs` is a 👨✈️Coqpit class that sets all the class arguments of the `MyModel`. `MyModelArgs` must have
|
||||||
an `MyModelArgs` instance to initiate the `MyModel`.
|
all the fields necessary to instantiate the `MyModel`. However, for training, you need to pass `MyModelConfig` to
|
||||||
|
the model.
|
||||||
|
|
||||||
7. Test `MyModel`.
|
7. Test `MyModel`.
|
||||||
|
|
||||||
|
@ -59,3 +60,149 @@
|
||||||
9. Write Docstrings.
|
9. Write Docstrings.
|
||||||
|
|
||||||
We love you more when you document your code. ❤️
|
We love you more when you document your code. ❤️
|
||||||
|
|
||||||
|
|
||||||
|
# Template 🐸TTS Model implementation
|
||||||
|
|
||||||
|
You can start implementing your model by copying the following base class.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from TTS.tts.models.base_tts import BaseTTS
|
||||||
|
|
||||||
|
|
||||||
|
class MyModel(BaseTTS):
|
||||||
|
"""
|
||||||
|
Notes on input/output tensor shapes:
|
||||||
|
Any input or output tensor of the model must be shaped as
|
||||||
|
|
||||||
|
- 3D tensors `batch x time x channels`
|
||||||
|
- 2D tensors `batch x channels`
|
||||||
|
- 1D tensors `batch x 1`
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Coqpit):
|
||||||
|
super().__init__()
|
||||||
|
self._set_model_args(config)
|
||||||
|
|
||||||
|
def _set_model_args(self, config: Coqpit):
|
||||||
|
"""Set model arguments from the config. Override this."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
|
||||||
|
"""Forward pass for the model mainly used in training.
|
||||||
|
|
||||||
|
You can be flexible here and use different number of arguments and argument names since it is intended to be
|
||||||
|
used by `train_step()` without exposing it out of the model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input (torch.Tensor): Input tensor.
|
||||||
|
aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: Model outputs. Main model output must be named as "model_outputs".
|
||||||
|
"""
|
||||||
|
outputs_dict = {"model_outputs": None}
|
||||||
|
...
|
||||||
|
return outputs_dict
|
||||||
|
|
||||||
|
def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
|
||||||
|
"""Forward pass for inference.
|
||||||
|
|
||||||
|
We don't use `*kwargs` since it is problematic with the TorchScript API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input (torch.Tensor): [description]
|
||||||
|
aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: [description]
|
||||||
|
"""
|
||||||
|
outputs_dict = {"model_outputs": None}
|
||||||
|
...
|
||||||
|
return outputs_dict
|
||||||
|
|
||||||
|
def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||||
|
"""Perform a single training step. Run the model forward pass and compute losses.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
batch (Dict): Input tensors.
|
||||||
|
criterion (nn.Module): Loss layer designed for the model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[Dict, Dict]: Model outputs and computed losses.
|
||||||
|
"""
|
||||||
|
outputs_dict = {}
|
||||||
|
loss_dict = {} # this returns from the criterion
|
||||||
|
...
|
||||||
|
return outputs_dict, loss_dict
|
||||||
|
|
||||||
|
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets:Dict, steps:int) -> None:
|
||||||
|
"""Create visualizations and waveform examples for training.
|
||||||
|
|
||||||
|
For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to
|
||||||
|
be projected onto Tensorboard.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ap (AudioProcessor): audio processor used at training.
|
||||||
|
batch (Dict): Model inputs used at the previous training step.
|
||||||
|
outputs (Dict): Model outputs generated at the previous training step.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||||
|
"""Perform a single evaluation step. Run the model forward pass and compute losses. In most cases, you can
|
||||||
|
call `train_step()` with no changes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
batch (Dict): Input tensors.
|
||||||
|
criterion (nn.Module): Loss layer designed for the model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[Dict, Dict]: Model outputs and computed losses.
|
||||||
|
"""
|
||||||
|
outputs_dict = {}
|
||||||
|
loss_dict = {} # this returns from the criterion
|
||||||
|
...
|
||||||
|
return outputs_dict, loss_dict
|
||||||
|
|
||||||
|
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets:Dict, steps:int) -> None:
|
||||||
|
"""The same as `train_log()`"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None:
|
||||||
|
"""Load a checkpoint and get ready for training or inference.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (Coqpit): Model configuration.
|
||||||
|
checkpoint_path (str): Path to the model checkpoint file.
|
||||||
|
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]:
|
||||||
|
"""Set up and return the optimizer or optimizers."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_lr(self) -> Union[float, List[float]]:
|
||||||
|
"""Return learning rate(s).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Union[float, List[float]]: Model's initial learning rates.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_scheduler(self, optimizer: torch.optim.Optimizer):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_criterion(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def format_batch(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,7 @@
|
||||||
models/glow_tts.md
|
models/glow_tts.md
|
||||||
models/vits.md
|
models/vits.md
|
||||||
models/forward_tts.md
|
models/forward_tts.md
|
||||||
|
models/tacotron1-2.md
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
# 🌮 Tacotron 1 and 2
|
||||||
|
|
||||||
|
Tacotron is one of the first successful DL-based text-to-mel models and opened up the whole TTS field for more DL research.
|
||||||
|
|
||||||
|
Tacotron mainly is an encoder-decoder model with attention.
|
||||||
|
|
||||||
|
The encoder takes input tokens (characters or phonemes) and the decoder outputs mel-spectrogram* frames. The attention module in between learns to align the input tokens with the output mel-spectrograms.
|
||||||
|
|
||||||
|
Tacotron1 and 2 are both built on the same encoder-decoder architecture but they use different layers. Additionally, Tacotron1 uses a Postnet module to convert mel-spectrograms to linear spectrograms with a higher resolution before the vocoder.
|
||||||
|
|
||||||
|
Vanilla Tacotron models are slow at inference due to the auto-regressive* nature that prevents the model from processing all the inputs in parallel. One trick is to use a higher “reduction rate” that helps the model to predict multiple frames at once. That is, reduction rate 2 reduces the number of decoder iterations by half.
|
||||||
|
|
||||||
|
Tacotron also uses a Prenet module with Dropout that projects the model’s previous output before feeding it to the decoder again. The paper and most of the implementations use the Dropout layer even in inference and they report the attention fails or the voice quality degrades otherwise. The downside is that you get a slightly different output speech every time you run the model.
|
||||||
|
|
||||||
|
Training the attention is notoriously problematic in Tacotron models. Especially in inference, for some input sequences, the alignment fails and causes the model to produce unexpected results. There are many different methods proposed to improve the attention.
|
||||||
|
|
||||||
|
After hundreds of experiments, @ 🐸TTS we suggest Double Decoder Consistency that leads to the most robust model performance.
|
||||||
|
|
||||||
|
If you have limited VRAM, then you can try using the Guided Attention Loss or the Dynamic Convolutional Attention. You can also combine the two.
|
||||||
|
|
||||||
|
|
||||||
|
## Important resources & papers
|
||||||
|
- Tacotron: https://arxiv.org/abs/1703.10135
|
||||||
|
- Tacotron2: https://arxiv.org/abs/1712.05884
|
||||||
|
- Double Decoder Consistency: https://coqui.ai/blog/tts/solving-attention-problems-of-tts-models-with-double-decoder-consistency
|
||||||
|
- Guided Attention Loss: https://arxiv.org/abs/1710.08969
|
||||||
|
- Forward & Backward Decoder: https://arxiv.org/abs/1907.09006
|
||||||
|
- Forward Attention: https://arxiv.org/abs/1807.06736
|
||||||
|
- Gaussian Attention: https://arxiv.org/abs/1910.10288
|
||||||
|
- Dynamic Convolutional Attention: https://arxiv.org/pdf/1910.10288.pdf
|
||||||
|
|
||||||
|
|
||||||
|
## BaseTacotron
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.models.base_tacotron.BaseTacotron
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tacotron
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.models.tacotron.Tacotron
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tacotron2
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.models.tacotron2.Tacotron2
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## TacotronConfig
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.configs.tacotron_config.TacotronConfig
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tacotron2Config
|
||||||
|
```{eval-rst}
|
||||||
|
.. autoclass:: TTS.tts.configs.tacotron2_config.Tacotron2Config
|
||||||
|
:members:
|
||||||
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,31 +1,36 @@
|
||||||
# Training a Model
|
# Training a Model
|
||||||
|
|
||||||
1. Decide what model you want to use.
|
1. Decide the model you want to use.
|
||||||
|
|
||||||
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
|
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
|
||||||
community models and see how fast and good each of the models is. Or you can start a discussion on our communication channels.
|
community models and see how fast and good each of the models is. Or you can start a discussion on our communication channels.
|
||||||
|
|
||||||
2. Understand the configuration class, its fields and values of your model.
|
2. Understand the configuration, its fields and values.
|
||||||
|
|
||||||
For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it.
|
For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it.
|
||||||
|
|
||||||
3. Go to the recipes and check the recipe of your target model.
|
3. Check the recipes.
|
||||||
|
|
||||||
Recipes do not promise perfect models but they provide a good start point for `Nervous Beginners`. A recipe script training
|
Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good starting point for
|
||||||
a `GlowTTS` model on `LJSpeech` dataset looks like below. Let's be creative and call this script `train_glowtts.py`.
|
`Nervous Beginners`.
|
||||||
|
A recipe for `GlowTTS` using `LJSpeech` dataset looks like below. Let's be creative and call this `train_glowtts.py`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# train_glowtts.py
|
# train_glowtts.py
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.tts.configs import GlowTTSConfig
|
|
||||||
from TTS.tts.configs import BaseDatasetConfig
|
|
||||||
from TTS.trainer import init_training, Trainer, TrainingArgs
|
|
||||||
|
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.shared_config import BaseDatasetConfig
|
||||||
|
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.glow_tts import GlowTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/"))
|
dataset_config = BaseDatasetConfig(
|
||||||
|
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
||||||
|
)
|
||||||
config = GlowTTSConfig(
|
config = GlowTTSConfig(
|
||||||
batch_size=32,
|
batch_size=32,
|
||||||
eval_batch_size=16,
|
eval_batch_size=16,
|
||||||
|
@ -34,33 +39,50 @@
|
||||||
run_eval=True,
|
run_eval=True,
|
||||||
test_delay_epochs=-1,
|
test_delay_epochs=-1,
|
||||||
epochs=1000,
|
epochs=1000,
|
||||||
text_cleaner="english_cleaners",
|
text_cleaner="phoneme_cleaners",
|
||||||
use_phonemes=False,
|
use_phonemes=True,
|
||||||
phoneme_language="en-us",
|
phoneme_language="en-us",
|
||||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
print_step=25,
|
print_step=25,
|
||||||
print_eval=True,
|
print_eval=False,
|
||||||
mixed_precision=False,
|
mixed_precision=True,
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
datasets=[dataset_config]
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GlowTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
|
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You need to change fields of the `BaseDatasetConfig` to match your own dataset and then update `GlowTTSConfig`
|
You need to change fields of the `BaseDatasetConfig` to match your dataset and then update `GlowTTSConfig`
|
||||||
fields as you need.
|
fields as you need.
|
||||||
|
|
||||||
4. Run the training.
|
4. Run the training.
|
||||||
|
|
||||||
You need to run the training script.
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
|
$ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
|
||||||
```
|
```
|
||||||
|
|
||||||
Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable.
|
Notice that we select the GPU for training via the `CUDA_VISIBLE_DEVICES` environment variable.
|
||||||
To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.
|
To see available GPUs on your system, you can use `nvidia-smi` command on the terminal.
|
||||||
|
|
||||||
If you like to run a multi-gpu training using DDP back-end,
|
If you like to run a multi-gpu training using DDP back-end,
|
||||||
|
@ -71,7 +93,7 @@
|
||||||
|
|
||||||
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
|
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
|
||||||
|
|
||||||
The beginning of a training run looks like below.
|
The beginning of a training log looks like this:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
> Experiment folder: /your/output_path/-Juni-23-2021_02+52-78899209
|
> Experiment folder: /your/output_path/-Juni-23-2021_02+52-78899209
|
||||||
|
@ -140,11 +162,11 @@
|
||||||
$ tensorboard --logdir=<path to your training directory>
|
$ tensorboard --logdir=<path to your training directory>
|
||||||
```
|
```
|
||||||
|
|
||||||
6. Check the logs and the Tensorboard and monitor the training.
|
6. Monitor the training process.
|
||||||
|
|
||||||
On the terminal and Tensorboard, you can monitor the losses and their changes over time. Also Tensorboard provides certain figures and sample outputs.
|
On the terminal and Tensorboard, you can monitor the progress of your model. Also Tensorboard provides certain figures and sample outputs.
|
||||||
|
|
||||||
Note that different models have different metrics, visuals and outputs to be displayed.
|
Note that different models have different metrics, visuals and outputs.
|
||||||
|
|
||||||
You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions
|
You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions
|
||||||
that occur in a training.
|
that occur in a training.
|
||||||
|
@ -163,3 +185,80 @@
|
||||||
8. Return to the step 1 and reiterate for training a `vocoder` model.
|
8. Return to the step 1 and reiterate for training a `vocoder` model.
|
||||||
|
|
||||||
In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models.
|
In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models.
|
||||||
|
|
||||||
|
|
||||||
|
# Multi-speaker Training
|
||||||
|
|
||||||
|
Training a multi-speaker model is mostly the same as training a single-speaker model.
|
||||||
|
You need to specify a couple of configuration parameters, initiate a `SpeakerManager` instance and pass it to the model.
|
||||||
|
|
||||||
|
The configuration parameters define whether you want to train the model with a speaker-embedding layer or pre-computed
|
||||||
|
d-vectors. For using d-vectors, you first need to compute the d-vectors using the `SpeakerEncoder`.
|
||||||
|
|
||||||
|
The same Glow-TTS model above can be trained on a multi-speaker VCTK dataset with the script below.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts import BaseDatasetConfig, GlowTTSConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.glow_tts import GlowTTS
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
# define dataset config for VCTK
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
# init audio processing config
|
||||||
|
audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=23.0)
|
||||||
|
|
||||||
|
# init training config
|
||||||
|
config = GlowTTSConfig(
|
||||||
|
batch_size=64,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=True,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# ONLY FOR MULTI-SPEAKER: init speaker manager for multi-speaker training
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GlowTTS(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
||||||
|
```
|
||||||
|
|
|
@ -23,63 +23,104 @@ each line.
|
||||||
|
|
||||||
### Pure Python Way
|
### Pure Python Way
|
||||||
|
|
||||||
```python
|
1. Define `train.py`.
|
||||||
import os
|
|
||||||
|
|
||||||
# GlowTTSConfig: all model related values for training, validating and testing.
|
```python
|
||||||
from TTS.tts.configs import GlowTTSConfig
|
import os
|
||||||
|
|
||||||
# BaseDatasetConfig: defines name, formatter and path of the dataset.
|
# GlowTTSConfig: all model related values for training, validating and testing.
|
||||||
from TTS.tts.configs import BaseDatasetConfig
|
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
|
|
||||||
# init_training: Initialize and setup the training environment.
|
# BaseDatasetConfig: defines name, formatter and path of the dataset.
|
||||||
# Trainer: Where the ✨️ happens.
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
# TrainingArgs: Defines the set of arguments of the Trainer.
|
|
||||||
from TTS.trainer import init_training, Trainer, TrainingArgs
|
|
||||||
|
|
||||||
# we use the same path as this script as our training folder.
|
# init_training: Initialize and setup the training environment.
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
# Trainer: Where the ✨️ happens.
|
||||||
|
# TrainingArgs: Defines the set of arguments of the Trainer.
|
||||||
|
from TTS.trainer import init_training, Trainer, TrainingArgs
|
||||||
|
|
||||||
# set LJSpeech as our target dataset and define its path so that the Trainer knows what data formatter it needs.
|
# we use the same path as this script as our training folder.
|
||||||
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/"))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
# Configure the model. Every config class inherits the BaseTTSConfig to have all the fields defined for the Trainer.
|
# set LJSpeech as our target dataset and define its path so that the Trainer knows what data formatter it needs.
|
||||||
config = GlowTTSConfig(
|
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/"))
|
||||||
batch_size=32,
|
|
||||||
eval_batch_size=16,
|
|
||||||
num_loader_workers=4,
|
|
||||||
num_eval_loader_workers=4,
|
|
||||||
run_eval=True,
|
|
||||||
test_delay_epochs=-1,
|
|
||||||
epochs=1000,
|
|
||||||
text_cleaner="english_cleaners",
|
|
||||||
use_phonemes=False,
|
|
||||||
phoneme_language="en-us",
|
|
||||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
|
||||||
print_step=25,
|
|
||||||
print_eval=True,
|
|
||||||
mixed_precision=False,
|
|
||||||
output_path=output_path,
|
|
||||||
datasets=[dataset_config]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Take the config and the default Trainer arguments, setup the training environment and override the existing
|
# Configure the model. Every config class inherits the BaseTTSConfig to have all the fields defined for the Trainer.
|
||||||
# config values from the terminal. So you can do the following.
|
config = GlowTTSConfig(
|
||||||
# >>> python train.py --coqpit.batch_size 128
|
batch_size=32,
|
||||||
args, config, output_path, _, _, _= init_training(TrainingArgs(), config)
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=True,
|
||||||
|
mixed_precision=False,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config]
|
||||||
|
)
|
||||||
|
|
||||||
# Initiate the Trainer.
|
# initialize the audio processor used for feature extraction and audio I/O.
|
||||||
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
|
# It is mainly used by the dataloader and the training loggers.
|
||||||
# distributed training etc.
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
trainer = Trainer(args, config, output_path)
|
|
||||||
|
|
||||||
# And kick it 🚀
|
# load a list of training samples
|
||||||
trainer.fit()
|
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||||
```
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# initialize the model
|
||||||
|
# Models only take the config object as input.
|
||||||
|
model = GlowTTS(config)
|
||||||
|
|
||||||
|
# Initiate the Trainer.
|
||||||
|
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
|
||||||
|
# distributed training, etc.
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
|
||||||
|
# And kick it 🚀
|
||||||
|
trainer.fit()
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Run the script.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0 python train.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- Continue a previous run.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0 python train.py --continue_path path/to/previous/run/folder/
|
||||||
|
```
|
||||||
|
|
||||||
|
- Fine-tune a model.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar
|
||||||
|
```
|
||||||
|
|
||||||
|
- Run multi-gpu training.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py
|
||||||
|
```
|
||||||
|
|
||||||
### CLI Way
|
### CLI Way
|
||||||
|
|
||||||
We still support running training from CLI like in the old days. The same training can be started as follows.
|
We still support running training from CLI like in the old days. The same training run can also be started as follows.
|
||||||
|
|
||||||
1. Define your `config.json`
|
1. Define your `config.json`
|
||||||
|
|
||||||
|
@ -111,45 +152,63 @@ We still support running training from CLI like in the old days. The same traini
|
||||||
$ CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path config.json
|
$ CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path config.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Training a `vocoder` Model
|
## Training a `vocoder` Model
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import HifiganConfig
|
from TTS.vocoder.configs import HifiganConfig
|
||||||
from TTS.trainer import init_training, Trainer, TrainingArgs
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.gan import GAN
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
config = HifiganConfig(
|
config = HifiganConfig(
|
||||||
batch_size=32,
|
batch_size=32,
|
||||||
eval_batch_size=16,
|
eval_batch_size=16,
|
||||||
num_loader_workers=4,
|
num_loader_workers=4,
|
||||||
num_eval_loader_workers=4,
|
num_eval_loader_workers=4,
|
||||||
run_eval=True,
|
run_eval=True,
|
||||||
test_delay_epochs=-1,
|
test_delay_epochs=5,
|
||||||
epochs=1000,
|
epochs=1000,
|
||||||
seq_len=8192,
|
seq_len=8192,
|
||||||
pad_short=2000,
|
pad_short=2000,
|
||||||
use_noise_augment=True,
|
use_noise_augment=True,
|
||||||
eval_split_size=10,
|
eval_split_size=10,
|
||||||
print_step=25,
|
print_step=25,
|
||||||
print_eval=True,
|
print_eval=False,
|
||||||
mixed_precision=False,
|
mixed_precision=False,
|
||||||
lr_gen=1e-4,
|
lr_gen=1e-4,
|
||||||
lr_disc=1e-4,
|
lr_disc=1e-4,
|
||||||
# `vocoder` only needs a data path and they read recursively all the `.wav` files underneath.
|
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GAN(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
```
|
```
|
||||||
|
|
||||||
❗️ Note that you can also start the training run from CLI as the `tts` model above.
|
❗️ Note that you can also start the training from the CLI with ```train_vocoder.py```, as with the ```tts``` models above.
|
||||||
|
|
||||||
## Synthesizing Speech
|
## Synthesizing Speech
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,342 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "6LWsNd3_M3MP"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# Mozilla TTS on CPU Real-Time Speech Synthesis "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "FAqrSIWgLyP0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
|
|
||||||
"\n",
|
|
||||||
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
|
|
||||||
"\n",
|
|
||||||
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
|
|
||||||
"\n",
|
|
||||||
"Note that both model performances can be improved with more training."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ku-dA4DKoeXk"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Download Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 162
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "jGIgnWhGsxU1",
|
|
||||||
"outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
|
|
||||||
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 235
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "4dnpE0-kvTsu",
|
|
||||||
"outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
|
|
||||||
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
|
|
||||||
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Zlgi8fPdpRF0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Define TTS function"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "f-Yc42nQZG5A"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
|
|
||||||
" t_1 = time.time()\n",
|
|
||||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
|
|
||||||
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
|
|
||||||
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
|
|
||||||
" if not use_gl:\n",
|
|
||||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
|
||||||
" waveform = waveform.flatten()\n",
|
|
||||||
" if use_cuda:\n",
|
|
||||||
" waveform = waveform.cpu()\n",
|
|
||||||
" waveform = waveform.numpy()\n",
|
|
||||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
|
||||||
" tps = (time.time() - t_1) / len(waveform)\n",
|
|
||||||
" print(waveform.shape)\n",
|
|
||||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
|
||||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
|
||||||
" print(\" > Time per step: {}\".format(tps))\n",
|
|
||||||
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
|
|
||||||
" return alignment, mel_postnet_spec, stop_tokens, waveform"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "ZksegYQepkFg"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Load Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "oVa0kOamprgj"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import torch\n",
|
|
||||||
"import time\n",
|
|
||||||
"import IPython\n",
|
|
||||||
"\n",
|
|
||||||
"from TTS.tts.utils.generic_utils import setup_model\n",
|
|
||||||
"from TTS.utils.io import load_config\n",
|
|
||||||
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
|
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
|
||||||
"from TTS.tts.utils.synthesis import synthesis"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "EY-sHVO8IFSH"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# runtime settings\n",
|
|
||||||
"use_cuda = False"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "_1aIUp2FpxOQ"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# model paths\n",
|
|
||||||
"TTS_MODEL = \"data/tts_model.pth.tar\"\n",
|
|
||||||
"TTS_CONFIG = \"data/config.json\"\n",
|
|
||||||
"VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
|
|
||||||
"VOCODER_CONFIG = \"data/config_vocoder.json\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "CpgmdBVQplbv"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load configs\n",
|
|
||||||
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
|
|
||||||
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 471
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zmrQxiozIUVE",
|
|
||||||
"outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load the audio processor\n",
|
|
||||||
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
|
|
||||||
"ap = AudioProcessor(**TTS_CONFIG.audio) "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 35
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "8fLoI4ipqMeS",
|
|
||||||
"outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# LOAD TTS MODEL\n",
|
|
||||||
"# multi speaker \n",
|
|
||||||
"speaker_id = None\n",
|
|
||||||
"speakers = []\n",
|
|
||||||
"\n",
|
|
||||||
"# load the model\n",
|
|
||||||
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
|
|
||||||
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
|
|
||||||
"\n",
|
|
||||||
"# load model state\n",
|
|
||||||
"cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
|
|
||||||
"\n",
|
|
||||||
"# load the model\n",
|
|
||||||
"model.load_state_dict(cp['model'])\n",
|
|
||||||
"if use_cuda:\n",
|
|
||||||
" model.cuda()\n",
|
|
||||||
"model.eval()\n",
|
|
||||||
"\n",
|
|
||||||
"# set model stepsize\n",
|
|
||||||
"if 'r' in cp:\n",
|
|
||||||
" model.decoder.set_r(cp['r'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 1000
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zKoq0GgzqzhQ",
|
|
||||||
"outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
|
|
||||||
"\n",
|
|
||||||
"# LOAD VOCODER MODEL\n",
|
|
||||||
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
|
|
||||||
"vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
|
|
||||||
"vocoder_model.remove_weight_norm()\n",
|
|
||||||
"vocoder_model.inference_padding = 0\n",
|
|
||||||
"\n",
|
|
||||||
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
|
|
||||||
"if use_cuda:\n",
|
|
||||||
" vocoder_model.cuda()\n",
|
|
||||||
"vocoder_model.eval()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ws_YkPKsLgo-"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"## Run Inference"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 134
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "FuWxZ9Ey5Puj",
|
|
||||||
"outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
|
||||||
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"accelerator": "GPU",
|
|
||||||
"colab": {
|
|
||||||
"collapsed_sections": [],
|
|
||||||
"name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
|
|
||||||
"provenance": [],
|
|
||||||
"toc_visible": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.5"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 4
|
|
||||||
}
|
|
|
@ -1,346 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "6LWsNd3_M3MP"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# Mozilla TTS on CPU Real-Time Speech Synthesis with Tensorflow"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "FAqrSIWgLyP0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n",
|
|
||||||
"\n",
|
|
||||||
"These TF models support TF 2.2 and for different versions you might need to\n",
|
|
||||||
"regenerate them. \n",
|
|
||||||
"\n",
|
|
||||||
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
|
|
||||||
"\n",
|
|
||||||
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
|
|
||||||
"\n",
|
|
||||||
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
|
|
||||||
"\n",
|
|
||||||
"Note that both model performances can be improved with more training.\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ku-dA4DKoeXk"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Download Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 162
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "jGIgnWhGsxU1",
|
|
||||||
"outputId": "08b0dddd-4edf-48c9-e8e5-a419b36a5c3d",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl\n",
|
|
||||||
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 235
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "4dnpE0-kvTsu",
|
|
||||||
"outputId": "2fe836eb-c7e7-4f1e-9352-0142126bb19f",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl\n",
|
|
||||||
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
|
|
||||||
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Zlgi8fPdpRF0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Define TTS function"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "f-Yc42nQZG5A"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def tts(model, text, CONFIG, p):\n",
|
|
||||||
" t_1 = time.time()\n",
|
|
||||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
|
|
||||||
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
|
|
||||||
" backend='tf')\n",
|
|
||||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
|
||||||
" waveform = waveform.numpy()[0, 0]\n",
|
|
||||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
|
||||||
" tps = (time.time() - t_1) / len(waveform)\n",
|
|
||||||
" print(waveform.shape)\n",
|
|
||||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
|
||||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
|
||||||
" print(\" > Time per step: {}\".format(tps))\n",
|
|
||||||
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
|
|
||||||
" return alignment, mel_postnet_spec, stop_tokens, waveform"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "ZksegYQepkFg"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Load Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "oVa0kOamprgj"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import torch\n",
|
|
||||||
"import time\n",
|
|
||||||
"import IPython\n",
|
|
||||||
"\n",
|
|
||||||
"from TTS.tts.tf.utils.generic_utils import setup_model\n",
|
|
||||||
"from TTS.tts.tf.utils.io import load_checkpoint\n",
|
|
||||||
"from TTS.utils.io import load_config\n",
|
|
||||||
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
|
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
|
||||||
"from TTS.tts.utils.synthesis import synthesis"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "EY-sHVO8IFSH"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# runtime settings\n",
|
|
||||||
"use_cuda = False"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "_1aIUp2FpxOQ"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# model paths\n",
|
|
||||||
"TTS_MODEL = \"data/tts_model.pkl\"\n",
|
|
||||||
"TTS_CONFIG = \"data/config.json\"\n",
|
|
||||||
"VOCODER_MODEL = \"data/vocoder_model.pkl\"\n",
|
|
||||||
"VOCODER_CONFIG = \"data/config_vocoder.json\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "CpgmdBVQplbv"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load configs\n",
|
|
||||||
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
|
|
||||||
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 471
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zmrQxiozIUVE",
|
|
||||||
"outputId": "fa71bd05-401f-4e5b-a6f7-60ae765966db",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load the audio processor\n",
|
|
||||||
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
|
|
||||||
"ap = AudioProcessor(**TTS_CONFIG.audio) "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 72
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "8fLoI4ipqMeS",
|
|
||||||
"outputId": "595d990f-930d-4698-ee14-77796b5eed7d",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# LOAD TTS MODEL\n",
|
|
||||||
"# multi speaker \n",
|
|
||||||
"speaker_id = None\n",
|
|
||||||
"speakers = []\n",
|
|
||||||
"\n",
|
|
||||||
"# load the model\n",
|
|
||||||
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
|
|
||||||
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
|
|
||||||
"model.build_inference()\n",
|
|
||||||
"model = load_checkpoint(model, TTS_MODEL)\n",
|
|
||||||
"model.decoder.set_max_decoder_steps(1000)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 489
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zKoq0GgzqzhQ",
|
|
||||||
"outputId": "2cc3deae-144f-4465-da3b-98628d948506"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from TTS.vocoder.tf.utils.generic_utils import setup_generator\n",
|
|
||||||
"from TTS.vocoder.tf.utils.io import load_checkpoint\n",
|
|
||||||
"\n",
|
|
||||||
"# LOAD VOCODER MODEL\n",
|
|
||||||
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
|
|
||||||
"vocoder_model.build_inference()\n",
|
|
||||||
"vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)\n",
|
|
||||||
"vocoder_model.inference_padding = 0\n",
|
|
||||||
"\n",
|
|
||||||
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ws_YkPKsLgo-"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"## Run Inference"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"Collapsed": "false",
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 134
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "FuWxZ9Ey5Puj",
|
|
||||||
"outputId": "07ede6e5-06e6-4612-f687-7984d20e5254"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
|
||||||
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"collapsed_sections": [],
|
|
||||||
"name": "DDC-TTS_and_MultiBand-MelGAN_TF_Example.ipynb",
|
|
||||||
"provenance": [],
|
|
||||||
"toc_visible": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.5"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 4
|
|
||||||
}
|
|
|
@ -1,342 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "6LWsNd3_M3MP"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# Mozilla TTS on CPU Real-Time Speech Synthesis "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "FAqrSIWgLyP0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
|
|
||||||
"\n",
|
|
||||||
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
|
|
||||||
"\n",
|
|
||||||
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
|
|
||||||
"\n",
|
|
||||||
"Note that both model performances can be improved with more training."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ku-dA4DKoeXk"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Download Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 162
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "jGIgnWhGsxU1",
|
|
||||||
"outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
|
|
||||||
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 235
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "4dnpE0-kvTsu",
|
|
||||||
"outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n",
|
|
||||||
"!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n",
|
|
||||||
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Zlgi8fPdpRF0"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Define TTS function"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "f-Yc42nQZG5A"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
|
|
||||||
" t_1 = time.time()\n",
|
|
||||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
|
|
||||||
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
|
|
||||||
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
|
|
||||||
" if not use_gl:\n",
|
|
||||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
|
||||||
" waveform = waveform.flatten()\n",
|
|
||||||
" if use_cuda:\n",
|
|
||||||
" waveform = waveform.cpu()\n",
|
|
||||||
" waveform = waveform.numpy()\n",
|
|
||||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
|
||||||
" tps = (time.time() - t_1) / len(waveform)\n",
|
|
||||||
" print(waveform.shape)\n",
|
|
||||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
|
||||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
|
||||||
" print(\" > Time per step: {}\".format(tps))\n",
|
|
||||||
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
|
|
||||||
" return alignment, mel_postnet_spec, stop_tokens, waveform"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "ZksegYQepkFg"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Load Models"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "oVa0kOamprgj"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import torch\n",
|
|
||||||
"import time\n",
|
|
||||||
"import IPython\n",
|
|
||||||
"\n",
|
|
||||||
"from TTS.tts.utils.generic_utils import setup_model\n",
|
|
||||||
"from TTS.utils.io import load_config\n",
|
|
||||||
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
|
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
|
||||||
"from TTS.tts.utils.synthesis import synthesis"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "EY-sHVO8IFSH"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# runtime settings\n",
|
|
||||||
"use_cuda = False"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "_1aIUp2FpxOQ"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# model paths\n",
|
|
||||||
"TTS_MODEL = \"data/tts_model.pth.tar\"\n",
|
|
||||||
"TTS_CONFIG = \"data/config.json\"\n",
|
|
||||||
"VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
|
|
||||||
"VOCODER_CONFIG = \"data/config_vocoder.json\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "CpgmdBVQplbv"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load configs\n",
|
|
||||||
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
|
|
||||||
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 471
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zmrQxiozIUVE",
|
|
||||||
"outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# load the audio processor\n",
|
|
||||||
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
|
|
||||||
"ap = AudioProcessor(**TTS_CONFIG.audio) "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 35
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "8fLoI4ipqMeS",
|
|
||||||
"outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# LOAD TTS MODEL\n",
|
|
||||||
"# multi speaker \n",
|
|
||||||
"speaker_id = None\n",
|
|
||||||
"speakers = []\n",
|
|
||||||
"\n",
|
|
||||||
"# load the model\n",
|
|
||||||
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
|
|
||||||
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
|
|
||||||
"\n",
|
|
||||||
"# load model state\n",
|
|
||||||
"cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
|
|
||||||
"\n",
|
|
||||||
"# load the model\n",
|
|
||||||
"model.load_state_dict(cp['model'])\n",
|
|
||||||
"if use_cuda:\n",
|
|
||||||
" model.cuda()\n",
|
|
||||||
"model.eval()\n",
|
|
||||||
"\n",
|
|
||||||
"# set model stepsize\n",
|
|
||||||
"if 'r' in cp:\n",
|
|
||||||
" model.decoder.set_r(cp['r'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 1000
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "zKoq0GgzqzhQ",
|
|
||||||
"outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
|
|
||||||
"\n",
|
|
||||||
"# LOAD VOCODER MODEL\n",
|
|
||||||
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
|
|
||||||
"vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
|
|
||||||
"vocoder_model.remove_weight_norm()\n",
|
|
||||||
"vocoder_model.inference_padding = 0\n",
|
|
||||||
"\n",
|
|
||||||
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
|
|
||||||
"if use_cuda:\n",
|
|
||||||
" vocoder_model.cuda()\n",
|
|
||||||
"vocoder_model.eval()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"colab_type": "text",
|
|
||||||
"id": "Ws_YkPKsLgo-"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"## Run Inference"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/",
|
|
||||||
"height": 134
|
|
||||||
},
|
|
||||||
"colab_type": "code",
|
|
||||||
"id": "FuWxZ9Ey5Puj",
|
|
||||||
"outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
|
||||||
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"accelerator": "GPU",
|
|
||||||
"colab": {
|
|
||||||
"collapsed_sections": [],
|
|
||||||
"name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
|
|
||||||
"provenance": [],
|
|
||||||
"toc_visible": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.5"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 4
|
|
||||||
}
|
|
|
@ -2,14 +2,16 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training."
|
"This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training."
|
||||||
],
|
]
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%load_ext autoreload\n",
|
"%load_ext autoreload\n",
|
||||||
"%autoreload 2\n",
|
"%autoreload 2\n",
|
||||||
|
@ -20,7 +22,7 @@
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"from tqdm import tqdm as tqdm\n",
|
"from tqdm import tqdm as tqdm\n",
|
||||||
"from torch.utils.data import DataLoader\n",
|
"from torch.utils.data import DataLoader\n",
|
||||||
"from TTS.tts.datasets.TTSDataset import TTSDataset\n",
|
"from TTS.tts.datasets.dataset import TTSDataset\n",
|
||||||
"from TTS.tts.layers.losses import L1LossMasked\n",
|
"from TTS.tts.layers.losses import L1LossMasked\n",
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
"from TTS.utils.audio import AudioProcessor\n",
|
||||||
"from TTS.config import load_config\n",
|
"from TTS.config import load_config\n",
|
||||||
|
@ -33,13 +35,13 @@
|
||||||
"\n",
|
"\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"os.environ['CUDA_VISIBLE_DEVICES']='2'"
|
"os.environ['CUDA_VISIBLE_DEVICES']='2'"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def set_filename(wav_path, out_path):\n",
|
"def set_filename(wav_path, out_path):\n",
|
||||||
" wav_file = os.path.basename(wav_path)\n",
|
" wav_file = os.path.basename(wav_path)\n",
|
||||||
|
@ -51,13 +53,13 @@
|
||||||
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
||||||
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
||||||
" return file_name, wavq_path, mel_path, wav_path"
|
" return file_name, wavq_path, mel_path, wav_path"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n",
|
"OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n",
|
||||||
"DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n",
|
"DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n",
|
||||||
|
@ -77,13 +79,13 @@
|
||||||
"C = load_config(CONFIG_PATH)\n",
|
"C = load_config(CONFIG_PATH)\n",
|
||||||
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
|
"C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
|
||||||
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"print(C['r'])\n",
|
"print(C['r'])\n",
|
||||||
"# if the vocabulary was passed, replace the default\n",
|
"# if the vocabulary was passed, replace the default\n",
|
||||||
|
@ -95,13 +97,13 @@
|
||||||
"# TODO: multiple speaker\n",
|
"# TODO: multiple speaker\n",
|
||||||
"model = setup_model(C)\n",
|
"model = setup_model(C)\n",
|
||||||
"model.load_checkpoint(C, MODEL_FILE, eval=True)"
|
"model.load_checkpoint(C, MODEL_FILE, eval=True)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
|
"preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
|
||||||
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
||||||
|
@ -120,20 +122,20 @@
|
||||||
"loader = DataLoader(\n",
|
"loader = DataLoader(\n",
|
||||||
" dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
|
" dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
|
||||||
")\n"
|
")\n"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Generate model outputs "
|
"### Generate model outputs "
|
||||||
],
|
]
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import pickle\n",
|
"import pickle\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -212,42 +214,42 @@
|
||||||
"\n",
|
"\n",
|
||||||
" print(np.mean(losses))\n",
|
" print(np.mean(losses))\n",
|
||||||
" print(np.mean(postnet_losses))"
|
" print(np.mean(postnet_losses))"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# for pwgan\n",
|
"# for pwgan\n",
|
||||||
"with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
|
"with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
|
||||||
" for data in metadata:\n",
|
" for data in metadata:\n",
|
||||||
" f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
|
" f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Sanity Check"
|
"### Sanity Check"
|
||||||
],
|
]
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"idx = 1\n",
|
"idx = 1\n",
|
||||||
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
|
"ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import soundfile as sf\n",
|
"import soundfile as sf\n",
|
||||||
"wav, sr = sf.read(item_idx[idx])\n",
|
"wav, sr = sf.read(item_idx[idx])\n",
|
||||||
|
@ -255,46 +257,46 @@
|
||||||
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
|
"mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
|
||||||
"mel_truth = ap.melspectrogram(wav)\n",
|
"mel_truth = ap.melspectrogram(wav)\n",
|
||||||
"print(mel_truth.shape)"
|
"print(mel_truth.shape)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot posnet output\n",
|
"# plot posnet output\n",
|
||||||
"print(mel_postnet[:mel_lengths[idx], :].shape)\n",
|
"print(mel_postnet[:mel_lengths[idx], :].shape)\n",
|
||||||
"plot_spectrogram(mel_postnet, ap)"
|
"plot_spectrogram(mel_postnet, ap)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot decoder output\n",
|
"# plot decoder output\n",
|
||||||
"print(mel_decoder.shape)\n",
|
"print(mel_decoder.shape)\n",
|
||||||
"plot_spectrogram(mel_decoder, ap)"
|
"plot_spectrogram(mel_decoder, ap)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# plot GT specgrogram\n",
|
"# plot GT specgrogram\n",
|
||||||
"print(mel_truth.shape)\n",
|
"print(mel_truth.shape)\n",
|
||||||
"plot_spectrogram(mel_truth.T, ap)"
|
"plot_spectrogram(mel_truth.T, ap)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# postnet, decoder diff\n",
|
"# postnet, decoder diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -303,13 +305,13 @@
|
||||||
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# PLOT GT SPECTROGRAM diff\n",
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -318,13 +320,13 @@
|
||||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# PLOT GT SPECTROGRAM diff\n",
|
"# PLOT GT SPECTROGRAM diff\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -334,22 +336,23 @@
|
||||||
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
||||||
"plt.colorbar()\n",
|
"plt.colorbar()\n",
|
||||||
"plt.tight_layout()"
|
"plt.tight_layout()"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"source": [],
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"metadata": {}
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
|
||||||
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"name": "python3",
|
"display_name": "Python 3.9.7 64-bit ('base': conda)",
|
||||||
"display_name": "Python 3.9.7 64-bit ('base': conda)"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
|
@ -362,9 +365,6 @@
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.7"
|
"version": "3.9.7"
|
||||||
},
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56"
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -19,19 +19,16 @@
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"import glob\n",
|
"import glob\n",
|
||||||
"import random\n",
|
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import torch\n",
|
|
||||||
"import umap\n",
|
"import umap\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
|
|
||||||
"from TTS.utils.audio import AudioProcessor\n",
|
"from TTS.utils.audio import AudioProcessor\n",
|
||||||
"from TTS.tts.utils.generic_utils import load_config\n",
|
"from TTS.config import load_config\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from bokeh.io import output_notebook, show\n",
|
"from bokeh.io import output_notebook, show\n",
|
||||||
"from bokeh.plotting import figure\n",
|
"from bokeh.plotting import figure\n",
|
||||||
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
|
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
|
||||||
"from bokeh.transform import factor_cmap, factor_mark\n",
|
"from bokeh.transform import factor_cmap\n",
|
||||||
"from bokeh.palettes import Category10"
|
"from bokeh.palettes import Category10"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -22,7 +22,6 @@
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"import sys\n",
|
"import sys\n",
|
||||||
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
|
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
|
||||||
"import glob\n",
|
|
||||||
"import librosa\n",
|
"import librosa\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
|
|
|
@ -21,10 +21,9 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import os, sys\n",
|
"import os\n",
|
||||||
"import glob\n",
|
"import glob\n",
|
||||||
"import subprocess\n",
|
"import subprocess\n",
|
||||||
"import tempfile\n",
|
|
||||||
"import IPython\n",
|
"import IPython\n",
|
||||||
"import soundfile as sf\n",
|
"import soundfile as sf\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -50,7 +50,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# import stuff\n",
|
"# import stuff\n",
|
||||||
"from TTS.utils.io import load_config\n",
|
"from TTS.utils.io import load_config\n",
|
||||||
"from TTS.tts.datasets.formatters import load_meta_data\n",
|
"from TTS.tts.datasets.formatters import load_tts_samples\n",
|
||||||
"from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n",
|
"from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n",
|
||||||
"from tqdm import tqdm\n",
|
"from tqdm import tqdm\n",
|
||||||
"from matplotlib import pylab as plt\n",
|
"from matplotlib import pylab as plt\n",
|
||||||
|
@ -75,7 +75,7 @@
|
||||||
"CONFIG = load_config(CONFIG_FILE)\n",
|
"CONFIG = load_config(CONFIG_FILE)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Load some properties from config.json\n",
|
"# Load some properties from config.json\n",
|
||||||
"CONFIG_METADATA = sorted(load_meta_data(CONFIG.datasets)[0])\n",
|
"CONFIG_METADATA = sorted(load_tts_samples(CONFIG.datasets)[0])\n",
|
||||||
"CONFIG_METADATA = CONFIG_METADATA\n",
|
"CONFIG_METADATA = CONFIG_METADATA\n",
|
||||||
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
||||||
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
# 🐸💬 TTS Training Recipes
|
# 🐸💬 TTS Training Recipes
|
||||||
|
|
||||||
TTS recipes intended to host bash scripts running all the necessary steps to train a TTS model with a particular dataset.
|
TTS recipes intended to host scripts running all the necessary steps to train a TTS model on a particular dataset.
|
||||||
|
|
||||||
Run each script from the root TTS folder as follows
|
For each dataset, you need to download the dataset once. Then you run the training for the model you want.
|
||||||
|
|
||||||
|
Run each script from the root TTS folder as follows.
|
||||||
|
|
||||||
```console
|
```console
|
||||||
$ bash ./recipes/<dataset>/<model>/run.sh
|
$ sh ./recipes/<dataset>/download_<dataset>.sh
|
||||||
|
$ python recipes/<dataset>/<model_name>/train.py
|
||||||
```
|
```
|
||||||
|
|
||||||
All the outputs are held under the recipe directory unless you change the paths in the bash script.
|
|
||||||
|
|
||||||
If you train a new model using TTS, feel free to share your training to expand the list of recipes.
|
If you train a new model using TTS, feel free to share your training to expand the list of recipes.
|
||||||
|
|
||||||
|
You can also open a new discussion and share your progress with the 🐸 community.
|
|
@ -1,9 +1,14 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
from TTS.tts.configs import AlignTTSConfig, BaseDatasetConfig
|
from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.align_tts import AlignTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# init configs
|
||||||
dataset_config = BaseDatasetConfig(
|
dataset_config = BaseDatasetConfig(
|
||||||
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
||||||
)
|
)
|
||||||
|
@ -25,6 +30,24 @@ config = AlignTTSConfig(
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
datasets=[dataset_config],
|
datasets=[dataset_config],
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = AlignTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -10,5 +10,5 @@ tar -xjf LJSpeech-1.1.tar.bz2
|
||||||
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
|
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
|
||||||
head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
|
head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
|
||||||
tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
|
tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
|
||||||
mv LJSpeech-1.1 $RUN_DIR/
|
mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/
|
||||||
rm LJSpeech-1.1.tar.bz2
|
rm LJSpeech-1.1.tar.bz2
|
|
@ -1,8 +1,11 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
from TTS.tts.configs import FastPitchConfig
|
from TTS.tts.configs.fast_pitch_config import FastPitchConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.utils.manage import ModelManager
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
@ -64,7 +67,23 @@ if not config.model_args.use_aligner:
|
||||||
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
||||||
)
|
)
|
||||||
|
|
||||||
# train the model
|
# init audio processor
|
||||||
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
ap = AudioProcessor(**config.audio)
|
||||||
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init the model
|
||||||
|
model = ForwardTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.fast_speech_config import FastSpeechConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.utils.manage import ModelManager
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# init configs
|
||||||
|
dataset_config = BaseDatasetConfig(
|
||||||
|
name="ljspeech",
|
||||||
|
meta_file_train="metadata.csv",
|
||||||
|
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
|
||||||
|
path=os.path.join(output_path, "../LJSpeech-1.1/"),
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=60.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = FastSpeechConfig(
|
||||||
|
run_name="fast_speech_ljspeech",
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=8,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
compute_f0=False,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=50,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=False,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
max_seq_len=500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# compute alignments
|
||||||
|
if not config.model_args.use_aligner:
|
||||||
|
manager = ModelManager()
|
||||||
|
model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
|
||||||
|
# TODO: make compute_attention python callable
|
||||||
|
os.system(
|
||||||
|
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init the model
|
||||||
|
model = ForwardTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -1,7 +1,11 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig
|
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.glow_tts import GlowTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
dataset_config = BaseDatasetConfig(
|
dataset_config = BaseDatasetConfig(
|
||||||
|
@ -25,6 +29,24 @@ config = GlowTTSConfig(
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
datasets=[dataset_config],
|
datasets=[dataset_config],
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GlowTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,29 +1,51 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import HifiganConfig
|
from TTS.vocoder.configs import HifiganConfig
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.gan import GAN
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
config = HifiganConfig(
|
config = HifiganConfig(
|
||||||
batch_size=32,
|
batch_size=32,
|
||||||
eval_batch_size=16,
|
eval_batch_size=16,
|
||||||
num_loader_workers=4,
|
num_loader_workers=4,
|
||||||
num_eval_loader_workers=4,
|
num_eval_loader_workers=4,
|
||||||
run_eval=True,
|
run_eval=True,
|
||||||
test_delay_epochs=-1,
|
test_delay_epochs=5,
|
||||||
epochs=1000,
|
epochs=1000,
|
||||||
seq_len=8192,
|
seq_len=8192,
|
||||||
pad_short=2000,
|
pad_short=2000,
|
||||||
use_noise_augment=True,
|
use_noise_augment=True,
|
||||||
eval_split_size=10,
|
eval_split_size=10,
|
||||||
print_step=25,
|
print_step=25,
|
||||||
print_eval=True,
|
print_eval=False,
|
||||||
mixed_precision=False,
|
mixed_precision=False,
|
||||||
lr_gen=1e-4,
|
lr_gen=1e-4,
|
||||||
lr_disc=1e-4,
|
lr_disc=1e-4,
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GAN(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,29 +1,51 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import MultibandMelganConfig
|
from TTS.vocoder.configs import MultibandMelganConfig
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.gan import GAN
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
config = MultibandMelganConfig(
|
config = MultibandMelganConfig(
|
||||||
batch_size=32,
|
batch_size=32,
|
||||||
eval_batch_size=16,
|
eval_batch_size=16,
|
||||||
num_loader_workers=4,
|
num_loader_workers=4,
|
||||||
num_eval_loader_workers=4,
|
num_eval_loader_workers=4,
|
||||||
run_eval=True,
|
run_eval=True,
|
||||||
test_delay_epochs=-1,
|
test_delay_epochs=5,
|
||||||
epochs=1000,
|
epochs=1000,
|
||||||
seq_len=8192,
|
seq_len=8192,
|
||||||
pad_short=2000,
|
pad_short=2000,
|
||||||
use_noise_augment=True,
|
use_noise_augment=True,
|
||||||
eval_split_size=10,
|
eval_split_size=10,
|
||||||
print_step=25,
|
print_step=25,
|
||||||
print_eval=True,
|
print_eval=False,
|
||||||
mixed_precision=False,
|
mixed_precision=False,
|
||||||
lr_gen=1e-4,
|
lr_gen=1e-4,
|
||||||
lr_disc=1e-4,
|
lr_disc=1e-4,
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GAN(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,18 +1,15 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
from TTS.tts.configs import SpeedySpeechConfig
|
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
# init configs
|
|
||||||
dataset_config = BaseDatasetConfig(
|
dataset_config = BaseDatasetConfig(
|
||||||
name="ljspeech",
|
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
||||||
meta_file_train="metadata.csv",
|
|
||||||
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
|
|
||||||
path=os.path.join(output_path, "../LJSpeech-1.1/"),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
audio_config = BaseAudioConfig(
|
audio_config = BaseAudioConfig(
|
||||||
|
@ -53,16 +50,32 @@ config = SpeedySpeechConfig(
|
||||||
datasets=[dataset_config],
|
datasets=[dataset_config],
|
||||||
)
|
)
|
||||||
|
|
||||||
# compute alignments
|
# # compute alignments
|
||||||
if not config.model_args.use_aligner:
|
# if not config.model_args.use_aligner:
|
||||||
manager = ModelManager()
|
# manager = ModelManager()
|
||||||
model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
|
# model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
|
||||||
# TODO: make compute_attention python callable
|
# # TODO: make compute_attention python callable
|
||||||
os.system(
|
# os.system(
|
||||||
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
# f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
|
||||||
)
|
# )
|
||||||
|
|
||||||
# train the model
|
# init audio processor
|
||||||
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = ForwardTTS(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,22 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# take the scripts's parent's directory to prefix all the output paths.
|
|
||||||
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
|
||||||
echo $RUN_DIR
|
|
||||||
# # download LJSpeech dataset
|
|
||||||
# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
|
||||||
# # extract
|
|
||||||
# tar -xjf LJSpeech-1.1.tar.bz2
|
|
||||||
# # create train-val splits
|
|
||||||
# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
|
|
||||||
# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
|
|
||||||
# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
|
|
||||||
# mv LJSpeech-1.1 $RUN_DIR/
|
|
||||||
# rm LJSpeech-1.1.tar.bz2
|
|
||||||
# # compute dataset mean and variance for normalization
|
|
||||||
# python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/
|
|
||||||
# training ....
|
|
||||||
# change the GPU id if needed
|
|
||||||
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DCA.json \
|
|
||||||
--coqpit.output_path $RUN_DIR \
|
|
||||||
--coqpit.datasets.0.path /media/erogol/nvme_linux/gdrive/Projects/TTS/recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/ \
|
|
||||||
--coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
|
|
Binary file not shown.
|
@ -1,85 +0,0 @@
|
||||||
{
|
|
||||||
"datasets": [
|
|
||||||
{
|
|
||||||
"name": "ljspeech",
|
|
||||||
"path": "DEFINE THIS",
|
|
||||||
"meta_file_train": "metadata.csv",
|
|
||||||
"meta_file_val": null
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"audio": {
|
|
||||||
"fft_size": 1024,
|
|
||||||
"win_length": 1024,
|
|
||||||
"hop_length": 256,
|
|
||||||
"frame_length_ms": null,
|
|
||||||
"frame_shift_ms": null,
|
|
||||||
"sample_rate": 22050,
|
|
||||||
"preemphasis": 0.0,
|
|
||||||
"ref_level_db": 20,
|
|
||||||
"do_trim_silence": true,
|
|
||||||
"trim_db": 60,
|
|
||||||
"power": 1.5,
|
|
||||||
"griffin_lim_iters": 60,
|
|
||||||
"num_mels": 80,
|
|
||||||
"mel_fmin": 50.0,
|
|
||||||
"mel_fmax": 7600.0,
|
|
||||||
"spec_gain": 1,
|
|
||||||
"signal_norm": true,
|
|
||||||
"min_level_db": -100,
|
|
||||||
"symmetric_norm": true,
|
|
||||||
"max_norm": 4.0,
|
|
||||||
"clip_norm": true,
|
|
||||||
"stats_path": "scale_stats.npy"
|
|
||||||
},
|
|
||||||
"distributed_backend": "nlcc",
|
|
||||||
"distributed_url": "tcp:\/\/localhost:54321",
|
|
||||||
"model": "Tacotron2",
|
|
||||||
"run_name": "ljspeech-dca",
|
|
||||||
"run_description": "tacotron2 with dynamic conv attention.",
|
|
||||||
"batch_size": 64,
|
|
||||||
"eval_batch_size": 16,
|
|
||||||
"mixed_precision": true,
|
|
||||||
"loss_masking": true,
|
|
||||||
"decoder_loss_alpha": 0.25,
|
|
||||||
"postnet_loss_alpha": 0.25,
|
|
||||||
"postnet_diff_spec_alpha": 0.25,
|
|
||||||
"decoder_diff_spec_alpha": 0.25,
|
|
||||||
"decoder_ssim_alpha": 0.25,
|
|
||||||
"postnet_ssim_alpha": 0.25,
|
|
||||||
"ga_alpha": 5.0,
|
|
||||||
"stopnet_pos_weight": 15.0,
|
|
||||||
"run_eval": true,
|
|
||||||
"test_delay_epochs": 10,
|
|
||||||
"max_decoder_steps": 1000,
|
|
||||||
"noam_schedule": true,
|
|
||||||
"grad_clip": 0.05,
|
|
||||||
"epochs": 1000,
|
|
||||||
"lr": 0.001,
|
|
||||||
"wd": 1e-06,
|
|
||||||
"warmup_steps": 4000,
|
|
||||||
"memory_size": -1,
|
|
||||||
"prenet_type": "original",
|
|
||||||
"prenet_dropout": true,
|
|
||||||
"attention_type": "dynamic_convolution",
|
|
||||||
"location_attn": true,
|
|
||||||
"attention_norm": "sigmoid",
|
|
||||||
"r": 2,
|
|
||||||
"stopnet": true,
|
|
||||||
"separate_stopnet": true,
|
|
||||||
"print_step": 25,
|
|
||||||
"tb_plot_step": 100,
|
|
||||||
"print_eval": false,
|
|
||||||
"save_step": 10000,
|
|
||||||
"checkpoint": true,
|
|
||||||
"text_cleaner": "phoneme_cleaners",
|
|
||||||
"num_loader_workers": 4,
|
|
||||||
"num_val_loader_workers": 4,
|
|
||||||
"batch_group_size": 4,
|
|
||||||
"min_seq_len": 6,
|
|
||||||
"max_seq_len": 180,
|
|
||||||
"compute_input_seq_cache": true,
|
|
||||||
"output_path": "DEFINE THIS",
|
|
||||||
"phoneme_cache_path": "DEFINE THIS",
|
|
||||||
"use_phonemes": false,
|
|
||||||
"phoneme_language": "en-us"
|
|
||||||
}
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs import BaseDatasetConfig, Tacotron2Config
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.tacotron2 import Tacotron2
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
# from TTS.tts.datasets.tokenizer import Tokenizer
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# init configs
|
||||||
|
dataset_config = BaseDatasetConfig(
|
||||||
|
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=60.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = Tacotron2Config( # This is the config that is saved for the future use
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=64,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
ga_alpha=5.0,
|
||||||
|
r=2,
|
||||||
|
attention_type="dynamic_convolution",
|
||||||
|
double_decoder_consistency=True,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=True,
|
||||||
|
mixed_precision=False,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Tacotron2(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -1,22 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# take the scripts's parent's directory to prefix all the output paths.
|
|
||||||
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
|
||||||
echo $RUN_DIR
|
|
||||||
# download LJSpeech dataset
|
|
||||||
wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
|
||||||
# extract
|
|
||||||
tar -xjf LJSpeech-1.1.tar.bz2
|
|
||||||
# create train-val splits
|
|
||||||
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
|
|
||||||
head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
|
|
||||||
tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
|
|
||||||
mv LJSpeech-1.1 $RUN_DIR/
|
|
||||||
rm LJSpeech-1.1.tar.bz2
|
|
||||||
# compute dataset mean and variance for normalization
|
|
||||||
python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/
|
|
||||||
# training ....
|
|
||||||
# change the GPU id if needed
|
|
||||||
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \
|
|
||||||
--coqpit.output_path $RUN_DIR \
|
|
||||||
--coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \
|
|
||||||
--coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
|
|
Binary file not shown.
|
@ -1,94 +0,0 @@
|
||||||
{
|
|
||||||
"model": "Tacotron2",
|
|
||||||
"datasets": [
|
|
||||||
{
|
|
||||||
"name": "ljspeech",
|
|
||||||
"path": "DEFINE THIS",
|
|
||||||
"meta_file_train": "metadata.csv",
|
|
||||||
"meta_file_val": null
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"audio": {
|
|
||||||
"fft_size": 1024,
|
|
||||||
"win_length": 1024,
|
|
||||||
"hop_length": 256,
|
|
||||||
"frame_length_ms": null,
|
|
||||||
"frame_shift_ms": null,
|
|
||||||
"sample_rate": 22050,
|
|
||||||
"preemphasis": 0.0,
|
|
||||||
"ref_level_db": 20,
|
|
||||||
"do_trim_silence": true,
|
|
||||||
"trim_db": 60,
|
|
||||||
"power": 1.5,
|
|
||||||
"griffin_lim_iters": 60,
|
|
||||||
"num_mels": 80,
|
|
||||||
"mel_fmin": 50.0,
|
|
||||||
"mel_fmax": 7600.0,
|
|
||||||
"spec_gain": 1,
|
|
||||||
"signal_norm": true,
|
|
||||||
"min_level_db": -100,
|
|
||||||
"symmetric_norm": true,
|
|
||||||
"max_norm": 4.0,
|
|
||||||
"clip_norm": true,
|
|
||||||
"stats_path": "scale_stats.npy"
|
|
||||||
},
|
|
||||||
"gst":{
|
|
||||||
"gst_embedding_dim": 256,
|
|
||||||
"gst_num_heads": 4,
|
|
||||||
"gst_num_style_tokens": 10
|
|
||||||
},
|
|
||||||
"distributed_backend": "gloo",
|
|
||||||
"distributed_url": "tcp:\/\/localhost:54321",
|
|
||||||
"run_name": "ljspeech-ddc",
|
|
||||||
"run_description": "tacotron2 with double decoder consistency.",
|
|
||||||
"batch_size": 64,
|
|
||||||
"eval_batch_size": 16,
|
|
||||||
"mixed_precision": false,
|
|
||||||
"loss_masking": true,
|
|
||||||
"decoder_loss_alpha": 0.25,
|
|
||||||
"postnet_loss_alpha": 0.25,
|
|
||||||
"postnet_diff_spec_alpha": 0.25,
|
|
||||||
"decoder_diff_spec_alpha": 0.25,
|
|
||||||
"decoder_ssim_alpha": 0.25,
|
|
||||||
"postnet_ssim_alpha": 0.25,
|
|
||||||
"ga_alpha": 5.0,
|
|
||||||
"stopnet_pos_weight": 15.0,
|
|
||||||
"run_eval": true,
|
|
||||||
"test_delay_epochs": 10,
|
|
||||||
"test_sentences_file": null,
|
|
||||||
"max_decoder_steps": 1000,
|
|
||||||
"noam_schedule": true,
|
|
||||||
"grad_clip": 0.05,
|
|
||||||
"epochs": 1000,
|
|
||||||
"lr": 0.001,
|
|
||||||
"wd": 1e-06,
|
|
||||||
"warmup_steps": 4000,
|
|
||||||
"memory_size": -1,
|
|
||||||
"prenet_type": "original",
|
|
||||||
"prenet_dropout": true,
|
|
||||||
"attention_type": "original",
|
|
||||||
"location_attn": true,
|
|
||||||
"double_decoder_consistency": true,
|
|
||||||
"ddc_r": 6,
|
|
||||||
"attention_norm": "sigmoid",
|
|
||||||
"r": 6,
|
|
||||||
"gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
|
|
||||||
"stopnet": true,
|
|
||||||
"separate_stopnet": true,
|
|
||||||
"print_step": 25,
|
|
||||||
"tb_plot_step": 100,
|
|
||||||
"print_eval": false,
|
|
||||||
"save_step": 10000,
|
|
||||||
"checkpoint": true,
|
|
||||||
"text_cleaner": "phoneme_cleaners",
|
|
||||||
"num_loader_workers": 4,
|
|
||||||
"num_val_loader_workers": 4,
|
|
||||||
"batch_group_size": 4,
|
|
||||||
"min_seq_len": 6,
|
|
||||||
"max_seq_len": 180,
|
|
||||||
"compute_input_seq_cache": true,
|
|
||||||
"output_path": "DEFINE THIS",
|
|
||||||
"phoneme_cache_path": "DEFINE THIS",
|
|
||||||
"use_phonemes": false,
|
|
||||||
"phoneme_language": "en-us"
|
|
||||||
}
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs import BaseDatasetConfig, Tacotron2Config
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.tacotron2 import Tacotron2
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
# from TTS.tts.datasets.tokenizer import Tokenizer
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
# init configs
|
||||||
|
dataset_config = BaseDatasetConfig(
|
||||||
|
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=60.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = Tacotron2Config( # This is the config that is saved for the future use
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=64,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
r=6,
|
||||||
|
gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
|
||||||
|
double_decoder_consistency=True,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=True,
|
||||||
|
mixed_precision=False,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Tacotron2(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -1,7 +1,10 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import UnivnetConfig
|
from TTS.vocoder.configs import UnivnetConfig
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.gan import GAN
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
config = UnivnetConfig(
|
config = UnivnetConfig(
|
||||||
|
@ -24,6 +27,24 @@ config = UnivnetConfig(
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GAN(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.config.shared_configs import BaseAudioConfig
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
from TTS.tts.configs import BaseDatasetConfig, VitsConfig
|
from TTS.tts.configs import BaseDatasetConfig, VitsConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.vits import Vits
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
dataset_config = BaseDatasetConfig(
|
dataset_config = BaseDatasetConfig(
|
||||||
|
@ -24,6 +27,7 @@ audio_config = BaseAudioConfig(
|
||||||
signal_norm=False,
|
signal_norm=False,
|
||||||
do_amp_to_db_linear=False,
|
do_amp_to_db_linear=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
config = VitsConfig(
|
config = VitsConfig(
|
||||||
audio=audio_config,
|
audio=audio_config,
|
||||||
run_name="vits_ljspeech",
|
run_name="vits_ljspeech",
|
||||||
|
@ -47,6 +51,24 @@ config = VitsConfig(
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
datasets=[dataset_config],
|
datasets=[dataset_config],
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Vits(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import WavegradConfig
|
from TTS.vocoder.configs import WavegradConfig
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.wavegrad import Wavegrad
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
config = WavegradConfig(
|
config = WavegradConfig(
|
||||||
|
@ -22,6 +25,24 @@ config = WavegradConfig(
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Wavegrad(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from TTS.trainer import Trainer, TrainingArgs, init_training
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.vocoder.configs import WavernnConfig
|
from TTS.vocoder.configs import WavernnConfig
|
||||||
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
|
from TTS.vocoder.models.wavernn import Wavernn
|
||||||
|
|
||||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
config = WavernnConfig(
|
config = WavernnConfig(
|
||||||
|
@ -24,6 +27,24 @@ config = WavernnConfig(
|
||||||
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
|
|
||||||
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger, cudnn_benchmark=True)
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Wavernn(config)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
trainer.fit()
|
trainer.fit()
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# take the script's parent directory to prefix all the output paths.
|
||||||
|
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||||
|
echo $RUN_DIR
|
||||||
|
# download VCTK dataset
|
||||||
|
wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip
|
||||||
|
# extract
|
||||||
|
mkdir VCTK
|
||||||
|
unzip VCTK-Corpus-0.92 -d VCTK
|
||||||
|
# move the extracted dataset into the VCTK recipes folder
|
||||||
|
mv VCTK $RUN_DIR/recipes/vctk/
|
||||||
|
rm VCTK-Corpus-0.92.zip
|
|
@ -0,0 +1,80 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.fast_pitch_config import FastPitchConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = FastPitchConfig(
|
||||||
|
run_name="fast_pitch_ljspeech",
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=8,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
compute_f0=True,
|
||||||
|
f0_cache_path=os.path.join(output_path, "f0_cache"),
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=50,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=False,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
max_seq_len=500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it maps speaker-id to speaker-name in the model and data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.model_args.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = ForwardTTS(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,80 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.fast_speech_config import FastSpeechConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = FastSpeechConfig(
|
||||||
|
run_name="fast_pitch_ljspeech",
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=8,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
compute_f0=True,
|
||||||
|
f0_cache_path=os.path.join(output_path, "f0_cache"),
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=50,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=False,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
max_seq_len=500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it maps speaker-id to speaker-name in the model and data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.model_args.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = ForwardTTS(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,62 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||||
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.glow_tts import GlowTTS
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=23.0)
|
||||||
|
|
||||||
|
config = GlowTTSConfig(
|
||||||
|
batch_size=64,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=True,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it maps speaker-id to speaker-name in the model and data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = GlowTTS(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,80 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.forward_tts import ForwardTTS
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = SpeedySpeechConfig(
|
||||||
|
run_name="fast_pitch_ljspeech",
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=8,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
compute_f0=True,
|
||||||
|
f0_cache_path=os.path.join(output_path, "f0_cache"),
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=50,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=False,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
max_seq_len=500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio)
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it maps speaker-id to speaker-name in the model and data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.model_args.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = ForwardTTS(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,80 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.configs.tacotron_config import TacotronConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.tacotron import Tacotron
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
resample=True, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this False for faster training.
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
ref_level_db=20,
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = TacotronConfig( # This is the config that is saved for the future use
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=48,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
r=6,
|
||||||
|
gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
|
||||||
|
double_decoder_consistency=True,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=25,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=True,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
min_seq_len=0,
|
||||||
|
max_seq_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True, # set this to enable multi-speaker training
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it mainly handles speaker-id to speaker-name for the model and the data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Tacotron(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,87 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.configs.tacotron2_config import Tacotron2Config
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.tacotron2 import Tacotron2
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
resample=False, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this False for faster training.
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
signal_norm=False,
|
||||||
|
mel_fmin=0.0,
|
||||||
|
mel_fmax=8000,
|
||||||
|
spec_gain=1.0,
|
||||||
|
log_func="np.log",
|
||||||
|
preemphasis=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = Tacotron2Config( # This is the config that is saved for the future use
|
||||||
|
audio=audio_config,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
r=2,
|
||||||
|
# gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
|
||||||
|
double_decoder_consistency=False,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="phoneme_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
print_step=150,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=True,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
min_seq_len=14800,
|
||||||
|
max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
use_speaker_embedding=True, # set this to enable multi-speaker training
|
||||||
|
decoder_ssim_alpha=0.0, # disable ssim losses that causes NaN for some runs.
|
||||||
|
postnet_ssim_alpha=0.0,
|
||||||
|
postnet_diff_spec_alpha=0.0,
|
||||||
|
decoder_diff_spec_alpha=0.0,
|
||||||
|
attention_norm="softmax",
|
||||||
|
optimizer="Adam",
|
||||||
|
lr_scheduler=None,
|
||||||
|
lr=3e-5,
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it mainly handles speaker-id to speaker-name for the model and the data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Tacotron2(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
|
@ -0,0 +1,86 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
from TTS.config.shared_configs import BaseAudioConfig
|
||||||
|
from TTS.trainer import Trainer, TrainingArgs
|
||||||
|
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.configs.vits_config import VitsConfig
|
||||||
|
from TTS.tts.datasets import load_tts_samples
|
||||||
|
from TTS.tts.models.vits import Vits
|
||||||
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
|
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
|
||||||
|
|
||||||
|
|
||||||
|
audio_config = BaseAudioConfig(
|
||||||
|
sample_rate=22050,
|
||||||
|
win_length=1024,
|
||||||
|
hop_length=256,
|
||||||
|
num_mels=80,
|
||||||
|
preemphasis=0.0,
|
||||||
|
ref_level_db=20,
|
||||||
|
log_func="np.log",
|
||||||
|
do_trim_silence=True,
|
||||||
|
trim_db=23.0,
|
||||||
|
mel_fmin=0,
|
||||||
|
mel_fmax=None,
|
||||||
|
spec_gain=1.0,
|
||||||
|
signal_norm=False,
|
||||||
|
do_amp_to_db_linear=False,
|
||||||
|
resample=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = VitsConfig(
|
||||||
|
audio=audio_config,
|
||||||
|
run_name="vits_vctk",
|
||||||
|
use_speaker_embedding=True,
|
||||||
|
batch_size=32,
|
||||||
|
eval_batch_size=16,
|
||||||
|
batch_group_size=5,
|
||||||
|
num_loader_workers=4,
|
||||||
|
num_eval_loader_workers=4,
|
||||||
|
run_eval=True,
|
||||||
|
test_delay_epochs=-1,
|
||||||
|
epochs=1000,
|
||||||
|
text_cleaner="english_cleaners",
|
||||||
|
use_phonemes=True,
|
||||||
|
phoneme_language="en-us",
|
||||||
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
|
compute_input_seq_cache=True,
|
||||||
|
print_step=25,
|
||||||
|
print_eval=False,
|
||||||
|
mixed_precision=True,
|
||||||
|
sort_by_audio_len=True,
|
||||||
|
min_seq_len=32 * 256 * 4,
|
||||||
|
max_seq_len=1500000,
|
||||||
|
output_path=output_path,
|
||||||
|
datasets=[dataset_config],
|
||||||
|
)
|
||||||
|
|
||||||
|
# init audio processor
|
||||||
|
ap = AudioProcessor(**config.audio.to_dict())
|
||||||
|
|
||||||
|
# load training samples
|
||||||
|
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||||
|
|
||||||
|
# init speaker manager for multi-speaker training
|
||||||
|
# it maps speaker-id to speaker-name in the model and data-loader
|
||||||
|
speaker_manager = SpeakerManager()
|
||||||
|
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
|
||||||
|
config.model_args.num_speakers = speaker_manager.num_speakers
|
||||||
|
|
||||||
|
# init model
|
||||||
|
model = Vits(config, speaker_manager)
|
||||||
|
|
||||||
|
# init the trainer and 🚀
|
||||||
|
trainer = Trainer(
|
||||||
|
TrainingArgs(),
|
||||||
|
config,
|
||||||
|
output_path,
|
||||||
|
model=model,
|
||||||
|
train_samples=train_samples,
|
||||||
|
eval_samples=eval_samples,
|
||||||
|
training_assets={"audio_processor": ap},
|
||||||
|
)
|
||||||
|
trainer.fit()
|
20
setup.py
20
setup.py
|
@ -1,4 +1,24 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# ,*++++++*, ,*++++++*,
|
||||||
|
# *++. .+++ *++. .++*
|
||||||
|
# *+* ,++++* *+* *+* ,++++, *+*
|
||||||
|
# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+,
|
||||||
|
# *+. .++++++++++++..++ *+.,++++++++++++. .+*
|
||||||
|
# .+* ++++++++++++.*+, .+*.++++++++++++ *+,
|
||||||
|
# .++ *++++++++* ++, .++.*++++++++* ++,
|
||||||
|
# ,+++*. . .*++, ,++*. .*+++*
|
||||||
|
# *+, .,*++**. .**++**. ,+*
|
||||||
|
# .+* *+,
|
||||||
|
# *+. .+*
|
||||||
|
# *+* +++ +++ *+*
|
||||||
|
# .+++*. . . *+++.
|
||||||
|
# ,+* *+++*... ...*+++* *+,
|
||||||
|
# .++. .""""+++++++****+++++++"""". ++.
|
||||||
|
# ,++. .++,
|
||||||
|
# .++* *++.
|
||||||
|
# *+++, ,+++*
|
||||||
|
# .,*++++::::::++++*,.
|
||||||
|
# ``````
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue